Merge remote-tracking branch 'origin/master' into lora-falcon

ggml-ci
This commit is contained in:
slaren 2023-12-16 15:41:31 +01:00
commit b7c7845277
220 changed files with 49117 additions and 18835 deletions

View file

@ -13,6 +13,8 @@ elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
./quantize "$@" ./quantize "$@"
elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
./main "$@" ./main "$@"
elif [[ "$arg1" == '--finetune' || "$arg1" == '-f' ]]; then
./finetune "$@"
elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
echo "Converting PTH to GGML..." echo "Converting PTH to GGML..."
for i in `ls $1/$2/ggml-model-f16.bin*`; do for i in `ls $1/$2/ggml-model-f16.bin*`; do
@ -34,6 +36,8 @@ else
echo " ex: --outtype f16 \"/models/7B/\" " echo " ex: --outtype f16 \"/models/7B/\" "
echo " --quantize (-q): Optimize with quantization process ggml" echo " --quantize (-q): Optimize with quantization process ggml"
echo " ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2" echo " ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
echo " --finetune (-f): Run finetune command to create a lora finetune of the model"
echo " See documentation for finetune for command-line parameters"
echo " --all-in-one (-a): Execute --convert & --quantize" echo " --all-in-one (-a): Execute --convert & --quantize"
echo " ex: \"/models/\" 7B" echo " ex: \"/models/\" 7B"
echo " --server (-s): Run a model on the server" echo " --server (-s): Run a model on the server"

View file

@ -1,6 +1,9 @@
*.o *.o
*.a *.a
.cache/ .cache/
.git/
.github/
.gitignore
.vs/ .vs/
.vscode/ .vscode/
.DS_Store .DS_Store

View file

@ -15,6 +15,9 @@ indent_size = 4
[Makefile] [Makefile]
indent_style = tab indent_style = tab
[scripts/*.mk]
indent_style = tab
[prompts/*.txt] [prompts/*.txt]
insert_final_newline = unset insert_final_newline = unset

View file

@ -1,8 +1,7 @@
--- ---
name: Issue and enhancement template name: Bug template
about: Used to report issues and request enhancements for llama.cpp about: Used to report bugs in llama.cpp
title: "[User] Insert summary of your issue or enhancement.." labels: ["bug-unconfirmed"]
labels: ''
assignees: '' assignees: ''
--- ---
@ -46,7 +45,7 @@ $ g++ --version
# Failure Information (for bugs) # Failure Information (for bugs)
Please help provide information about the failure if this is a bug. If it is not a bug, please remove the rest of this template. Please help provide information about the failure / bug.
# Steps to Reproduce # Steps to Reproduce

28
.github/ISSUE_TEMPLATE/enhancement.md vendored Normal file
View file

@ -0,0 +1,28 @@
---
name: Enhancement template
about: Used to request enhancements for llama.cpp
labels: ["enhancement"]
assignees: ''
---
# Prerequisites
Please answer the following questions for yourself before submitting an issue.
- [ ] I am running the latest code. Development is very rapid so there are no tagged versions as of now.
- [ ] I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
- [ ] I [searched using keywords relevant to my issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) to make sure that I am creating a new issue that is not already open (or closed).
- [ ] I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new bug or useful enhancement to share.
# Feature Description
Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement.
# Motivation
Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users.
# Possible Implementation
If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better.

View file

@ -10,10 +10,10 @@ on:
push: push:
branches: branches:
- master - master
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu'] paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']
pull_request: pull_request:
types: [opened, synchronize, reopened] types: [opened, synchronize, reopened]
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu'] paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']
env: env:
BRANCH_NAME: ${{ github.head_ref || github.ref_name }} BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
@ -143,6 +143,9 @@ jobs:
cd build cd build
ctest --verbose ctest --verbose
# TODO: build with LLAMA_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
# how to debug it.
# ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124
macOS-latest-make: macOS-latest-make:
runs-on: macos-latest runs-on: macos-latest
@ -160,14 +163,18 @@ jobs:
- name: Build - name: Build
id: make_build id: make_build
run: | run: |
make -j $(sysctl -n hw.logicalcpu) LLAMA_NO_METAL=1 make -j $(sysctl -n hw.logicalcpu)
- name: Test - name: Test
id: make_test id: make_test
run: | run: |
make tests -j $(sysctl -n hw.logicalcpu) LLAMA_NO_METAL=1 make tests -j $(sysctl -n hw.logicalcpu)
make test -j $(sysctl -n hw.logicalcpu) LLAMA_NO_METAL=1 make test -j $(sysctl -n hw.logicalcpu)
# TODO: build with LLAMA_METAL=OFF because test-backend-ops fail on "Apple Paravirtual device" and I don't know
# how to debug it.
# ref: https://github.com/ggerganov/llama.cpp/actions/runs/7132125951/job/19422043567?pr=4359#step:5:6584
# would be great if we fix these
macOS-latest-cmake: macOS-latest-cmake:
runs-on: macos-latest runs-on: macos-latest
@ -188,7 +195,7 @@ jobs:
sysctl -a sysctl -a
mkdir build mkdir build
cd build cd build
cmake -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF .. cmake -DLLAMA_METAL=OFF ..
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
- name: Test - name: Test
@ -253,6 +260,34 @@ jobs:
-DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
macOS-latest-swift:
runs-on: macos-latest
strategy:
matrix:
destination: ['generic/platform=macOS', 'generic/platform=iOS', 'generic/platform=tvOS']
steps:
- name: Clone
id: checkout
uses: actions/checkout@v1
- name: Dependencies
id: depends
continue-on-error: true
run: |
brew update
- name: xcodebuild for swift package
id: xcodebuild
run: |
xcodebuild -scheme llama -destination "${{ matrix.destination }}"
- name: Build Swift Example
id: make_build_swift_example
run: |
make swift
windows-latest-cmake: windows-latest-cmake:
runs-on: windows-latest runs-on: windows-latest
@ -260,22 +295,23 @@ jobs:
OPENBLAS_VERSION: 0.3.23 OPENBLAS_VERSION: 0.3.23
OPENCL_VERSION: 2023.04.17 OPENCL_VERSION: 2023.04.17
CLBLAST_VERSION: 1.6.0 CLBLAST_VERSION: 1.6.0
SDE_VERSION: 9.21.1-2023-04-24
strategy: strategy:
matrix: matrix:
include: include:
- build: 'noavx' - build: 'noavx'
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON' defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON'
- build: 'avx2' - build: 'avx2'
defines: '-DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON' defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
- build: 'avx' - build: 'avx'
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON' defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
- build: 'avx512' - build: 'avx512'
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON' defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
- build: 'clblast' - build: 'clblast'
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"' defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
- build: 'openblas' - build: 'openblas'
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"' defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
steps: steps:
- name: Clone - name: Clone
@ -355,11 +391,23 @@ jobs:
- name: Test - name: Test
id: cmake_test id: cmake_test
if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} # Test AVX-512 only when possible if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} # not all machines have native AVX-512
run: | run: |
cd build cd build
ctest -C Release --verbose --timeout 900 ctest -C Release --verbose --timeout 900
- name: Test (Intel SDE)
id: cmake_test_sde
if: ${{ matrix.build == 'avx512' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
run: |
curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/777395/sde-external-${env:SDE_VERSION}-win.tar.xz"
# for some weird reason windows tar doesn't like sde tar.xz
7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar.xz
7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar
$sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe)
cd build
& $sde -future -- ctest -C Release --verbose --timeout 900
- name: Determine tag name - name: Determine tag name
id: tag id: tag
shell: bash shell: bash
@ -414,7 +462,7 @@ jobs:
run: | run: |
mkdir build mkdir build
cd build cd build
cmake .. -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON -DBUILD_SHARED_LIBS=ON cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON -DBUILD_SHARED_LIBS=ON
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS} cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
- name: Determine tag name - name: Determine tag name
@ -457,6 +505,17 @@ jobs:
path: | path: |
cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
ios-xcode-build:
runs-on: macos-latest
steps:
- name: Checkout code
uses: actions/checkout@v3
- name: Build Xcode project
run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build
# freeBSD-latest: # freeBSD-latest:
# runs-on: macos-12 # runs-on: macos-12
# steps: # steps:

View file

@ -36,8 +36,9 @@ jobs:
poetry install poetry install
- name: Build package - name: Build package
run: poetry build run: cd gguf-py && poetry build
- name: Publish package - name: Publish package
uses: pypa/gh-action-pypi-publish@release/v1 uses: pypa/gh-action-pypi-publish@release/v1
with: with:
password: ${{ secrets.PYPI_API_TOKEN }} password: ${{ secrets.PYPI_API_TOKEN }}
packages-dir: gguf-py/dist

20
.github/workflows/python-lint.yml vendored Normal file
View file

@ -0,0 +1,20 @@
name: flake8 Lint
on: [push, pull_request]
jobs:
flake8-lint:
runs-on: ubuntu-latest
name: Lint
steps:
- name: Check out source repository
uses: actions/checkout@v3
- name: Set up Python environment
uses: actions/setup-python@v4
with:
python-version: "3.11"
- name: flake8 Lint
uses: py-actions/flake8@v2
with:
ignore: "E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704"
exclude: "examples/*,examples/*/**,*/**/__init__.py"

25
.github/workflows/zig-build.yml vendored Normal file
View file

@ -0,0 +1,25 @@
name: Zig CI
on:
pull_request:
push:
branches:
- master
jobs:
build:
strategy:
fail-fast: false
matrix:
runs-on: [ubuntu-latest, macos-latest, windows-latest]
runs-on: ${{ matrix.runs-on }}
steps:
- uses: actions/checkout@v3
with:
submodules: recursive
fetch-depth: 0
- uses: goto-bus-stop/setup-zig@v2
with:
version: 0.11.0
- name: Build Summary
run: zig build --summary all -freference-trace

35
.gitignore vendored
View file

@ -10,9 +10,12 @@
*.gcno *.gcno
*.gcda *.gcda
*.dot *.dot
*.bat
*.metallib
.DS_Store .DS_Store
.build/ .build/
.cache/ .cache/
.ccls-cache/
.direnv/ .direnv/
.envrc .envrc
.swiftpm .swiftpm
@ -40,8 +43,11 @@ models-mnt
/embedding /embedding
/gguf /gguf
/gguf-llama-simple /gguf-llama-simple
/infill
/libllama.so /libllama.so
/llama-bench /llama-bench
/llava-cli
/lookahead
/main /main
/metal /metal
/perplexity /perplexity
@ -53,13 +59,15 @@ models-mnt
/server /server
/simple /simple
/batched /batched
/batched-bench
/export-lora /export-lora
/finetune /finetune
/speculative /speculative
/parallel /parallel
/train-text-from-scratch /train-text-from-scratch
/tokenize
/vdot /vdot
build-info.h /common/build-info.cpp
arm_neon.h arm_neon.h
compile_commands.json compile_commands.json
CMakeSettings.json CMakeSettings.json
@ -80,14 +88,17 @@ poetry.lock
poetry.toml poetry.toml
# Test binaries # Test binaries
tests/test-grammar-parser /tests/test-grammar-parser
tests/test-llama-grammar /tests/test-llama-grammar
tests/test-double-float /tests/test-double-float
tests/test-grad0 /tests/test-grad0
tests/test-opt /tests/test-opt
tests/test-quantize-fns /tests/test-quantize-fns
tests/test-quantize-perf /tests/test-quantize-perf
tests/test-sampling /tests/test-sampling
tests/test-tokenizer-0-llama /tests/test-tokenizer-0-llama
tests/test-tokenizer-0-falcon /tests/test-tokenizer-0-falcon
tests/test-tokenizer-1 /tests/test-tokenizer-1-llama
/tests/test-tokenizer-1-bpe
/tests/test-rope
/tests/test-backend-ops

View file

@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.12) # Don't bump this version for no reason cmake_minimum_required(VERSION 3.13) # for add_link_options
project("llama.cpp" C CXX) project("llama.cpp" C CXX)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
@ -10,7 +10,7 @@ endif()
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
set(LLAMA_STANDALONE ON) set(LLAMA_STANDALONE ON)
# configure project version # configure project version
@ -43,8 +43,9 @@ else()
endif() endif()
# general # general
option(BUILD_SHARED_LIBS "build shared libraries" OFF)
option(LLAMA_STATIC "llama: static link libraries" OFF) option(LLAMA_STATIC "llama: static link libraries" OFF)
option(LLAMA_NATIVE "llama: enable -march=native flag" OFF) option(LLAMA_NATIVE "llama: enable -march=native flag" ON)
option(LLAMA_LTO "llama: enable link time optimization" OFF) option(LLAMA_LTO "llama: enable link time optimization" OFF)
# debug # debug
@ -58,15 +59,21 @@ option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer"
option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF) option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)
# instruction set specific # instruction set specific
option(LLAMA_AVX "llama: enable AVX" ON) if (LLAMA_NATIVE)
option(LLAMA_AVX2 "llama: enable AVX2" ON) set(INS_ENB OFF)
option(LLAMA_AVX512 "llama: enable AVX512" OFF) else()
option(LLAMA_AVX512_VBMI "llama: enable AVX512-VBMI" OFF) set(INS_ENB ON)
option(LLAMA_AVX512_VNNI "llama: enable AVX512-VNNI" OFF) endif()
option(LLAMA_FMA "llama: enable FMA" ON)
option(LLAMA_AVX "llama: enable AVX" ${INS_ENB})
option(LLAMA_AVX2 "llama: enable AVX2" ${INS_ENB})
option(LLAMA_AVX512 "llama: enable AVX512" OFF)
option(LLAMA_AVX512_VBMI "llama: enable AVX512-VBMI" OFF)
option(LLAMA_AVX512_VNNI "llama: enable AVX512-VNNI" OFF)
option(LLAMA_FMA "llama: enable FMA" ${INS_ENB})
# in MSVC F16C is implied with AVX2/AVX512 # in MSVC F16C is implied with AVX2/AVX512
if (NOT MSVC) if (NOT MSVC)
option(LLAMA_F16C "llama: enable F16C" ON) option(LLAMA_F16C "llama: enable F16C" ${INS_ENB})
endif() endif()
# 3rd party libs # 3rd party libs
@ -76,6 +83,7 @@ set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
option(LLAMA_CUBLAS "llama: use CUDA" OFF) option(LLAMA_CUBLAS "llama: use CUDA" OFF)
#option(LLAMA_CUDA_CUBLAS "llama: use cuBLAS for prompt processing" OFF) #option(LLAMA_CUDA_CUBLAS "llama: use cuBLAS for prompt processing" OFF)
option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF) option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
option(LLAMA_CUDA_FORCE_MMQ "llama: use mmq kernels instead of cuBLAS" OFF)
set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels") set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels") set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some calculations" OFF) option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some calculations" OFF)
@ -87,46 +95,15 @@ option(LLAMA_CLBLAST "llama: use CLBlast"
option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT}) option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT})
option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF) option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF)
option(LLAMA_MPI "llama: use MPI" OFF) option(LLAMA_MPI "llama: use MPI" OFF)
option(LLAMA_K_QUANTS "llama: use k-quants" ON)
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF) option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_SERVER "llama: build server example" ON) option(LLAMA_BUILD_SERVER "llama: build server example" ON)
# # Required for relocatable CMake package
# Build info header
#
# Generate initial build-info.h
include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake) include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git")
set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/.git")
# Is git submodule
if(NOT IS_DIRECTORY "${GIT_DIR}")
file(READ ${GIT_DIR} REAL_GIT_DIR_LINK)
string(REGEX REPLACE "gitdir: (.*)\n$" "\\1" REAL_GIT_DIR ${REAL_GIT_DIR_LINK})
set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/${REAL_GIT_DIR}")
endif()
# Add a custom target for build-info.h
add_custom_target(BUILD_INFO ALL DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h")
# Add a custom command to rebuild build-info.h when .git/index changes
add_custom_command(
OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h"
COMMENT "Generating build details from Git"
COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION} -DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake"
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
DEPENDS "${GIT_DIR}/index"
VERBATIM
)
else()
message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.")
endif()
# #
# Compile flags # Compile flags
# #
@ -139,6 +116,11 @@ set(THREADS_PREFER_PTHREAD_FLAG ON)
find_package(Threads REQUIRED) find_package(Threads REQUIRED)
include(CheckCXXCompilerFlag) include(CheckCXXCompilerFlag)
# enable libstdc++ assertions for debug builds
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
add_compile_definitions($<$<CONFIG:Debug>:_GLIBCXX_ASSERTIONS>)
endif()
if (NOT MSVC) if (NOT MSVC)
if (LLAMA_SANITIZE_THREAD) if (LLAMA_SANITIZE_THREAD)
add_compile_options(-fsanitize=thread) add_compile_options(-fsanitize=thread)
@ -188,7 +170,7 @@ if (LLAMA_METAL)
#add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/") #add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")
# copy ggml-metal.metal to bin directory # copy ggml-metal.metal to bin directory
configure_file(ggml-metal.metal bin/ggml-metal.metal COPYONLY) configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}
${FOUNDATION_LIBRARY} ${FOUNDATION_LIBRARY}
@ -271,13 +253,8 @@ if (LLAMA_BLAS)
endif() endif()
endif() endif()
if (LLAMA_K_QUANTS) if (LLAMA_QKK_64)
set(GGML_HEADERS_EXTRA k_quants.h) add_compile_definitions(GGML_QKK_64)
set(GGML_SOURCES_EXTRA k_quants.c)
add_compile_definitions(GGML_USE_K_QUANTS)
if (LLAMA_QKK_64)
add_compile_definitions(GGML_QKK_64)
endif()
endif() endif()
if (LLAMA_CUBLAS) if (LLAMA_CUBLAS)
@ -299,6 +276,9 @@ if (LLAMA_CUBLAS)
if (LLAMA_CUDA_FORCE_DMMV) if (LLAMA_CUDA_FORCE_DMMV)
add_compile_definitions(GGML_CUDA_FORCE_DMMV) add_compile_definitions(GGML_CUDA_FORCE_DMMV)
endif() endif()
if (LLAMA_CUDA_FORCE_MMQ)
add_compile_definitions(GGML_CUDA_FORCE_MMQ)
endif()
add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X}) add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y}) add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
if (DEFINED LLAMA_CUDA_DMMV_Y) if (DEFINED LLAMA_CUDA_DMMV_Y)
@ -325,6 +305,7 @@ if (LLAMA_CUBLAS)
set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
else() else()
set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
#set(CMAKE_CUDA_ARCHITECTURES "") # use this to compile much faster, but only F16 models work
endif() endif()
endif() endif()
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}") message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
@ -343,8 +324,9 @@ if (LLAMA_MPI)
set(GGML_SOURCES_MPI ggml-mpi.c ggml-mpi.h) set(GGML_SOURCES_MPI ggml-mpi.c ggml-mpi.h)
add_compile_definitions(GGML_USE_MPI) add_compile_definitions(GGML_USE_MPI)
add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS}) add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS})
set(cxx_flags ${cxx_flags} -Wno-cast-qual) if (NOT MSVC)
set(c_flags ${c_flags} -Wno-cast-qual) add_compile_options(-Wno-cast-qual)
endif()
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${MPI_C_LIBRARIES}) set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${MPI_C_LIBRARIES})
set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${MPI_C_INCLUDE_DIRS}) set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${MPI_C_INCLUDE_DIRS})
# Even if you're only using the C header, C++ programs may bring in MPI # Even if you're only using the C header, C++ programs may bring in MPI
@ -397,6 +379,9 @@ if (LLAMA_HIPBLAS)
if (LLAMA_CUDA_FORCE_DMMV) if (LLAMA_CUDA_FORCE_DMMV)
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV) target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV)
endif() endif()
if (LLAMA_CUDA_FORCE_MMQ)
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_MMQ)
endif()
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X}) target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y}) target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER}) target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
@ -412,44 +397,101 @@ if (LLAMA_HIPBLAS)
endif() endif()
endif() endif()
if (LLAMA_ALL_WARNINGS) function(get_flags CCID CCVER)
if (NOT MSVC) set(C_FLAGS "")
set(warning_flags -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function) set(CXX_FLAGS "")
set(c_flags -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int
-Werror=implicit-function-declaration)
set(cxx_flags -Wmissing-declarations -Wmissing-noreturn)
if (CMAKE_C_COMPILER_ID MATCHES "Clang") if (CCID MATCHES "Clang")
set(warning_flags ${warning_flags} -Wunreachable-code-break -Wunreachable-code-return) set(C_FLAGS -Wunreachable-code-break -Wunreachable-code-return)
set(cxx_flags ${cxx_flags} -Wmissing-prototypes -Wextra-semi) set(CXX_FLAGS -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi)
if ( if (
(CMAKE_C_COMPILER_ID STREQUAL "Clang" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 3.8.0) OR (CCID STREQUAL "Clang" AND CCVER VERSION_GREATER_EQUAL 3.8.0) OR
(CMAKE_C_COMPILER_ID STREQUAL "AppleClang" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 7.3.0) (CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0)
) )
set(c_flags ${c_flags} -Wdouble-promotion) set(C_FLAGS ${C_FLAGS} -Wdouble-promotion)
endif() endif()
elseif (CMAKE_C_COMPILER_ID STREQUAL "GNU") elseif (CCID STREQUAL "GNU")
set(c_flags ${c_flags} -Wdouble-promotion) set(C_FLAGS -Wdouble-promotion)
set(cxx_flags ${cxx_flags} -Wno-array-bounds) set(CXX_FLAGS -Wno-array-bounds)
if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 7.1.0) if (CCVER VERSION_GREATER_EQUAL 7.1.0)
set(cxx_flags ${cxx_flags} -Wno-format-truncation) set(CXX_FLAGS ${CXX_FLAGS} -Wno-format-truncation)
endif() endif()
if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 8.1.0) if (CCVER VERSION_GREATER_EQUAL 8.1.0)
set(cxx_flags ${cxx_flags} -Wextra-semi) set(CXX_FLAGS ${CXX_FLAGS} -Wextra-semi)
endif()
endif() endif()
else()
# todo : msvc
endif() endif()
add_compile_options( set(GF_C_FLAGS ${C_FLAGS} PARENT_SCOPE)
${warning_flags} set(GF_CXX_FLAGS ${CXX_FLAGS} PARENT_SCOPE)
"$<$<COMPILE_LANGUAGE:C>:${c_flags}>" endfunction()
"$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags}>"
)
if (LLAMA_ALL_WARNINGS)
if (NOT MSVC)
set(WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
set(C_FLAGS -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes
-Werror=implicit-int -Werror=implicit-function-declaration)
set(CXX_FLAGS -Wmissing-declarations -Wmissing-noreturn)
set(C_FLAGS ${WARNING_FLAGS} ${C_FLAGS})
set(CXX_FLAGS ${WARNING_FLAGS} ${CXX_FLAGS})
get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION})
add_compile_options("$<$<COMPILE_LANGUAGE:C>:${C_FLAGS};${GF_C_FLAGS}>"
"$<$<COMPILE_LANGUAGE:CXX>:${CXX_FLAGS};${GF_CXX_FLAGS}>")
else()
# todo : msvc
set(C_FLAGS "")
set(CXX_FLAGS "")
endif()
endif()
if (LLAMA_CUBLAS)
set(CUDA_FLAGS ${CXX_FLAGS} -use_fast_math)
if (NOT MSVC)
set(CUDA_FLAGS ${CUDA_FLAGS} -Wno-pedantic)
endif()
if (LLAMA_ALL_WARNINGS AND NOT MSVC)
set(NVCC_CMD ${CMAKE_CUDA_COMPILER} .c)
if (NOT CMAKE_CUDA_HOST_COMPILER STREQUAL "")
set(NVCC_CMD ${NVCC_CMD} -ccbin ${CMAKE_CUDA_HOST_COMPILER})
endif()
execute_process(
COMMAND ${NVCC_CMD} -Xcompiler --version
OUTPUT_VARIABLE CUDA_CCFULLVER
ERROR_QUIET
)
if (NOT CUDA_CCFULLVER MATCHES clang)
set(CUDA_CCID "GNU")
execute_process(
COMMAND ${NVCC_CMD} -Xcompiler "-dumpfullversion -dumpversion"
OUTPUT_VARIABLE CUDA_CCVER
ERROR_QUIET
)
else()
if (CUDA_CCFULLVER MATCHES Apple)
set(CUDA_CCID "AppleClang")
else()
set(CUDA_CCID "Clang")
endif()
string(REGEX REPLACE "^.* version ([0-9.]*).*$" "\\1" CUDA_CCVER ${CUDA_CCFULLVER})
endif()
message("-- CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}")
get_flags(${CUDA_CCID} ${CUDA_CCVER})
list(JOIN GF_CXX_FLAGS " " CUDA_CXX_FLAGS) # pass host compiler flags as a single argument
if (NOT CUDA_CXX_FLAGS STREQUAL "")
set(CUDA_FLAGS ${CUDA_FLAGS} -Xcompiler ${CUDA_CXX_FLAGS})
endif()
endif()
add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${CUDA_FLAGS}>")
endif() endif()
if (WIN32) if (WIN32)
@ -470,6 +512,16 @@ if (LLAMA_LTO)
endif() endif()
endif() endif()
# this version of Apple ld64 is buggy
execute_process(
COMMAND ${CMAKE_C_COMPILER} ${CMAKE_EXE_LINKER_FLAGS} -Wl,-v
ERROR_VARIABLE output
OUTPUT_QUIET
)
if (output MATCHES "dyld-1015\.7")
add_compile_definitions(HAVE_BUGGY_APPLE_LINKER)
endif()
# Architecture specific # Architecture specific
# TODO: probably these flags need to be tweaked on some architectures # TODO: probably these flags need to be tweaked on some architectures
# feel free to update the Makefile for your architecture and send a pull request or issue # feel free to update the Makefile for your architecture and send a pull request or issue
@ -491,9 +543,6 @@ if (NOT MSVC)
if (LLAMA_GPROF) if (LLAMA_GPROF)
add_compile_options(-pg) add_compile_options(-pg)
endif() endif()
if (LLAMA_NATIVE)
add_compile_options(-march=native)
endif()
endif() endif()
if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") OR ("${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "arm64")) if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") OR ("${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "arm64"))
@ -525,6 +574,10 @@ if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATC
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "^(x86_64|i686|amd64|x64)$" ) elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "^(x86_64|i686|amd64|x64)$" )
message(STATUS "x86 detected") message(STATUS "x86 detected")
if (MSVC) if (MSVC)
# instruction set detection for MSVC only
if (LLAMA_NATIVE)
include(cmake/FindSIMD.cmake)
endif ()
if (LLAMA_AVX512) if (LLAMA_AVX512)
add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX512>) add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX512>)
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX512>) add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX512>)
@ -548,6 +601,9 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GE
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX>) add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX>)
endif() endif()
else() else()
if (LLAMA_NATIVE)
add_compile_options(-march=native)
endif()
if (LLAMA_F16C) if (LLAMA_F16C)
add_compile_options(-mf16c) add_compile_options(-mf16c)
endif() endif()
@ -573,12 +629,21 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GE
endif() endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64") elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
message(STATUS "PowerPC detected") message(STATUS "PowerPC detected")
add_compile_options(-mcpu=native -mtune=native) if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
#TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be) add_compile_options(-mcpu=powerpc64le)
else()
add_compile_options(-mcpu=native -mtune=native)
#TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
endif()
else() else()
message(STATUS "Unknown architecture") message(STATUS "Unknown architecture")
endif() endif()
if (MINGW)
# Target Windows 8 for PrefetchVirtualMemory
add_compile_definitions(_WIN32_WINNT=0x602)
endif()
# #
# POSIX conformance # POSIX conformance
# #
@ -644,11 +709,15 @@ add_library(ggml OBJECT
ggml.h ggml.h
ggml-alloc.c ggml-alloc.c
ggml-alloc.h ggml-alloc.h
${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA} ggml-backend.c
ggml-backend.h
ggml-quants.c
ggml-quants.h
${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL} ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL} ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI} ${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA} ${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
) )
target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES}) target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
@ -705,6 +774,7 @@ set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR}
set(LLAMA_BUILD_NUMBER ${BUILD_NUMBER}) set(LLAMA_BUILD_NUMBER ${BUILD_NUMBER})
set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT}) set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT})
set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER}) set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER})
get_directory_property(LLAMA_TRANSIENT_DEFINES COMPILE_DEFINITIONS)
configure_package_config_file( configure_package_config_file(
${CMAKE_CURRENT_SOURCE_DIR}/scripts/LlamaConfig.cmake.in ${CMAKE_CURRENT_SOURCE_DIR}/scripts/LlamaConfig.cmake.in

304
Makefile
View file

@ -1,8 +1,15 @@
# Define the default target now so that it is always the first target # Define the default target now so that it is always the first target
BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml simple batched save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative benchmark-matmult parallel finetune export-lora tests/test-c.o BUILD_TARGETS = \
main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead tests/test-c.o
# Binaries only useful for tests # Binaries only useful for tests
TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama TEST_TARGETS = \
tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \
tests/test-backend-ops
# Code coverage output files # Code coverage output files
COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@ -19,20 +26,6 @@ ifndef UNAME_M
UNAME_M := $(shell uname -m) UNAME_M := $(shell uname -m)
endif endif
ifeq '' '$(findstring clang,$(shell $(CC) --version))'
CC_IS_GCC=1
CC_VER := $(shell $(CC) -dumpfullversion -dumpversion | awk -F. '{ printf("%02d%02d%02d", $$1, $$2, $$3) }')
else
CC_IS_CLANG=1
ifeq '' '$(findstring Apple LLVM,$(shell $(CC) --version))'
CC_IS_LLVM_CLANG=1
else
CC_IS_APPLE_CLANG=1
endif
CC_VER := $(shell $(CC) --version | sed -n 's/^.* version \([0-9.]*\).*$$/\1/p' \
| awk -F. '{ printf("%02d%02d%02d", $$1, $$2, $$3) }')
endif
# Mac OS + Arm can report x86_64 # Mac OS + Arm can report x86_64
# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789 # ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
ifeq ($(UNAME_S),Darwin) ifeq ($(UNAME_S),Darwin)
@ -62,9 +55,11 @@ test: $(TEST_TARGETS)
if [ "$$test_target" = "tests/test-tokenizer-0-llama" ]; then \ if [ "$$test_target" = "tests/test-tokenizer-0-llama" ]; then \
./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \ ./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \
elif [ "$$test_target" = "tests/test-tokenizer-0-falcon" ]; then \ elif [ "$$test_target" = "tests/test-tokenizer-0-falcon" ]; then \
continue; \ ./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
elif [ "$$test_target" = "tests/test-tokenizer-1-llama" ]; then \ elif [ "$$test_target" = "tests/test-tokenizer-1-llama" ]; then \
continue; \ continue; \
elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \
continue; \
else \ else \
echo "Running test $$test_target..."; \ echo "Running test $$test_target..."; \
./$$test_target; \ ./$$test_target; \
@ -112,12 +107,12 @@ MK_CXXFLAGS = -std=c++11 -fPIC
# -Ofast tends to produce faster code, but may not be available for some compilers. # -Ofast tends to produce faster code, but may not be available for some compilers.
ifdef LLAMA_FAST ifdef LLAMA_FAST
MK_CFLAGS += -Ofast MK_CFLAGS += -Ofast
MK_HOST_CXXFLAGS += -Ofast HOST_CXXFLAGS += -Ofast
MK_CUDA_CXXFLAGS += -O3 MK_NVCCFLAGS += -O3
else else
MK_CFLAGS += -O3 MK_CFLAGS += -O3
MK_CXXFLAGS += -O3 MK_CXXFLAGS += -O3
endif endif
# clock_gettime came in POSIX.1b (1993) # clock_gettime came in POSIX.1b (1993)
@ -166,10 +161,32 @@ ifdef LLAMA_DEBUG
MK_CFLAGS += -O0 -g MK_CFLAGS += -O0 -g
MK_CXXFLAGS += -O0 -g MK_CXXFLAGS += -O0 -g
MK_LDFLAGS += -g MK_LDFLAGS += -g
ifeq ($(UNAME_S),Linux)
MK_CXXFLAGS += -Wp,-D_GLIBCXX_ASSERTIONS
endif
else else
MK_CPPFLAGS += -DNDEBUG MK_CPPFLAGS += -DNDEBUG
endif endif
ifdef LLAMA_SANITIZE_THREAD
MK_CFLAGS += -fsanitize=thread -g
MK_CXXFLAGS += -fsanitize=thread -g
MK_LDFLAGS += -fsanitize=thread -g
endif
ifdef LLAMA_SANITIZE_ADDRESS
MK_CFLAGS += -fsanitize=address -fno-omit-frame-pointer -g
MK_CXXFLAGS += -fsanitize=address -fno-omit-frame-pointer -g
MK_LDFLAGS += -fsanitize=address -fno-omit-frame-pointer -g
endif
ifdef LLAMA_SANITIZE_UNDEFINED
MK_CFLAGS += -fsanitize=undefined -g
MK_CXXFLAGS += -fsanitize=undefined -g
MK_LDFLAGS += -fsanitize=undefined -g
endif
ifdef LLAMA_SERVER_VERBOSE ifdef LLAMA_SERVER_VERBOSE
MK_CPPFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE) MK_CPPFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE)
endif endif
@ -189,28 +206,9 @@ MK_CFLAGS += $(WARN_FLAGS) -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmis
-Werror=implicit-function-declaration -Werror=implicit-function-declaration
MK_CXXFLAGS += $(WARN_FLAGS) -Wmissing-declarations -Wmissing-noreturn MK_CXXFLAGS += $(WARN_FLAGS) -Wmissing-declarations -Wmissing-noreturn
ifeq ($(CC_IS_CLANG), 1) # this version of Apple ld64 is buggy
# clang options ifneq '' '$(findstring dyld-1015.7,$(shell $(CC) $(LDFLAGS) -Wl,-v 2>&1))'
MK_CFLAGS += -Wunreachable-code-break -Wunreachable-code-return MK_CPPFLAGS += -DHAVE_BUGGY_APPLE_LINKER
MK_HOST_CXXFLAGS += -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi
ifneq '' '$(and $(CC_IS_LLVM_CLANG),$(filter 1,$(shell expr $(CC_VER) \>= 030800)))'
MK_CFLAGS += -Wdouble-promotion
endif
ifneq '' '$(and $(CC_IS_APPLE_CLANG),$(filter 1,$(shell expr $(CC_VER) \>= 070300)))'
MK_CFLAGS += -Wdouble-promotion
endif
else
# gcc options
MK_CFLAGS += -Wdouble-promotion
MK_HOST_CXXFLAGS += -Wno-array-bounds
ifeq ($(shell expr $(CC_VER) \>= 070100), 1)
MK_HOST_CXXFLAGS += -Wno-format-truncation
endif
ifeq ($(shell expr $(CC_VER) \>= 080100), 1)
MK_HOST_CXXFLAGS += -Wextra-semi
endif
endif endif
# OS specific # OS specific
@ -258,8 +256,8 @@ ifndef RISCV
ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64)) ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
# Use all CPU extensions that are available: # Use all CPU extensions that are available:
MK_CFLAGS += -march=native -mtune=native MK_CFLAGS += -march=native -mtune=native
MK_HOST_CXXFLAGS += -march=native -mtune=native HOST_CXXFLAGS += -march=native -mtune=native
# Usage AVX-only # Usage AVX-only
#MK_CFLAGS += -mfma -mf16c -mavx #MK_CFLAGS += -mfma -mf16c -mavx
@ -270,12 +268,15 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
#MK_CXXFLAGS += -mssse3 #MK_CXXFLAGS += -mssse3
endif endif
# The stack is only 16-byte aligned on Windows, so don't let gcc emit aligned moves.
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412
# https://github.com/ggerganov/llama.cpp/issues/2922
ifneq '' '$(findstring mingw,$(shell $(CC) -dumpmachine))' ifneq '' '$(findstring mingw,$(shell $(CC) -dumpmachine))'
# The stack is only 16-byte aligned on Windows, so don't let gcc emit aligned moves.
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412
# https://github.com/ggerganov/llama.cpp/issues/2922
MK_CFLAGS += -Xassembler -muse-unaligned-vector-move MK_CFLAGS += -Xassembler -muse-unaligned-vector-move
MK_CXXFLAGS += -Xassembler -muse-unaligned-vector-move MK_CXXFLAGS += -Xassembler -muse-unaligned-vector-move
# Target Windows 8 for PrefetchVirtualMemory
MK_CPPFLAGS += -D_WIN32_WINNT=0x602
endif endif
ifneq ($(filter aarch64%,$(UNAME_M)),) ifneq ($(filter aarch64%,$(UNAME_M)),)
@ -311,18 +312,20 @@ ifneq ($(filter ppc64%,$(UNAME_M)),)
endif endif
endif endif
ifneq ($(filter ppc64le%,$(UNAME_M)),)
MK_CFLAGS += -mcpu=powerpc64le
MK_CXXFLAGS += -mcpu=powerpc64le
CUDA_POWER_ARCH = 1
endif
else else
MK_CFLAGS += -march=rv64gcv -mabi=lp64d MK_CFLAGS += -march=rv64gcv -mabi=lp64d
MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
endif endif
ifndef LLAMA_NO_K_QUANTS
MK_CPPFLAGS += -DGGML_USE_K_QUANTS
OBJS += k_quants.o
ifdef LLAMA_QKK_64 ifdef LLAMA_QKK_64
MK_CPPFLAGS += -DGGML_QKK_64 MK_CPPFLAGS += -DGGML_QKK_64
endif endif
endif
ifndef LLAMA_NO_ACCELERATE ifndef LLAMA_NO_ACCELERATE
# Mac OS - include Accelerate framework. # Mac OS - include Accelerate framework.
@ -339,7 +342,7 @@ ifdef LLAMA_MPI
MK_CPPFLAGS += -DGGML_USE_MPI MK_CPPFLAGS += -DGGML_USE_MPI
MK_CFLAGS += -Wno-cast-qual MK_CFLAGS += -Wno-cast-qual
MK_CXXFLAGS += -Wno-cast-qual MK_CXXFLAGS += -Wno-cast-qual
OBJS += ggml-mpi.o OBJS += ggml-mpi.o
endif # LLAMA_MPI endif # LLAMA_MPI
ifdef LLAMA_OPENBLAS ifdef LLAMA_OPENBLAS
@ -356,57 +359,65 @@ endif # LLAMA_BLIS
ifdef LLAMA_CUBLAS ifdef LLAMA_CUBLAS
MK_CPPFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include MK_CPPFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
MK_LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib MK_LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
OBJS += ggml-cuda.o OBJS += ggml-cuda.o
NVCCFLAGS = --forward-unknown-to-host-compiler -use_fast_math MK_NVCCFLAGS = --forward-unknown-to-host-compiler -use_fast_math
ifdef LLAMA_DEBUG
MK_NVCCFLAGS += -lineinfo
endif
ifdef LLAMA_CUDA_NVCC ifdef LLAMA_CUDA_NVCC
NVCC = $(LLAMA_CUDA_NVCC) NVCC = $(LLAMA_CUDA_NVCC)
else else
NVCC = nvcc NVCC = nvcc
endif #LLAMA_CUDA_NVCC endif #LLAMA_CUDA_NVCC
ifdef CUDA_DOCKER_ARCH ifdef CUDA_DOCKER_ARCH
NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH) MK_NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
else else ifndef CUDA_POWER_ARCH
NVCCFLAGS += -arch=native MK_NVCCFLAGS += -arch=native
endif # CUDA_DOCKER_ARCH endif # CUDA_DOCKER_ARCH
ifdef LLAMA_CUDA_FORCE_DMMV ifdef LLAMA_CUDA_FORCE_DMMV
NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV MK_NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
endif # LLAMA_CUDA_FORCE_DMMV endif # LLAMA_CUDA_FORCE_DMMV
ifdef LLAMA_CUDA_FORCE_MMQ
MK_NVCCFLAGS += -DGGML_CUDA_FORCE_MMQ
endif # LLAMA_CUDA_FORCE_MMQ
ifdef LLAMA_CUDA_DMMV_X ifdef LLAMA_CUDA_DMMV_X
NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) MK_NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
else else
NVCCFLAGS += -DGGML_CUDA_DMMV_X=32 MK_NVCCFLAGS += -DGGML_CUDA_DMMV_X=32
endif # LLAMA_CUDA_DMMV_X endif # LLAMA_CUDA_DMMV_X
ifdef LLAMA_CUDA_MMV_Y ifdef LLAMA_CUDA_MMV_Y
NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y) MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
else ifdef LLAMA_CUDA_DMMV_Y else ifdef LLAMA_CUDA_DMMV_Y
NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_DMMV_Y) # for backwards compatibility MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_DMMV_Y) # for backwards compatibility
else else
NVCCFLAGS += -DGGML_CUDA_MMV_Y=1 MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
endif # LLAMA_CUDA_MMV_Y endif # LLAMA_CUDA_MMV_Y
ifdef LLAMA_CUDA_F16 ifdef LLAMA_CUDA_F16
NVCCFLAGS += -DGGML_CUDA_F16 MK_NVCCFLAGS += -DGGML_CUDA_F16
endif # LLAMA_CUDA_F16 endif # LLAMA_CUDA_F16
ifdef LLAMA_CUDA_DMMV_F16 ifdef LLAMA_CUDA_DMMV_F16
NVCCFLAGS += -DGGML_CUDA_F16 MK_NVCCFLAGS += -DGGML_CUDA_F16
endif # LLAMA_CUDA_DMMV_F16 endif # LLAMA_CUDA_DMMV_F16
ifdef LLAMA_CUDA_KQUANTS_ITER ifdef LLAMA_CUDA_KQUANTS_ITER
NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER) MK_NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
else else
NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2 MK_NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
endif endif
ifdef LLAMA_CUDA_PEER_MAX_BATCH_SIZE ifdef LLAMA_CUDA_PEER_MAX_BATCH_SIZE
NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(LLAMA_CUDA_PEER_MAX_BATCH_SIZE) MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(LLAMA_CUDA_PEER_MAX_BATCH_SIZE)
else else
NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE
#ifdef LLAMA_CUDA_CUBLAS #ifdef LLAMA_CUDA_CUBLAS
# NVCCFLAGS += -DGGML_CUDA_CUBLAS # MK_NVCCFLAGS += -DGGML_CUDA_CUBLAS
#endif # LLAMA_CUDA_CUBLAS #endif # LLAMA_CUDA_CUBLAS
ifdef LLAMA_CUDA_CCBIN ifdef LLAMA_CUDA_CCBIN
NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN) MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
endif endif
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
$(NVCC) $(NVCCFLAGS) -c $< -o $@ $(NVCC) $(BASE_CXXFLAGS) $(NVCCFLAGS) -Wno-pedantic -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
endif # LLAMA_CUBLAS endif # LLAMA_CUBLAS
ifdef LLAMA_CLBLAST ifdef LLAMA_CLBLAST
@ -468,21 +479,22 @@ ggml-mpi.o: ggml-mpi.c ggml-mpi.h
$(CC) $(CFLAGS) -c $< -o $@ $(CC) $(CFLAGS) -c $< -o $@
endif # LLAMA_MPI endif # LLAMA_MPI
ifndef LLAMA_NO_K_QUANTS GF_CC := $(CC)
k_quants.o: k_quants.c k_quants.h include scripts/get-flags.mk
$(CC) $(CFLAGS) -c $< -o $@
endif # LLAMA_NO_K_QUANTS
# combine build flags with cmdline overrides # combine build flags with cmdline overrides
override CFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(CFLAGS) override CFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(GF_CFLAGS) $(CFLAGS)
override CXXFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS) BASE_CXXFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS)
override CUDA_CXXFLAGS := $(MK_CUDA_CXXFLAGS) $(CUDA_CXXFLAGS) override CXXFLAGS := $(BASE_CXXFLAGS) $(HOST_CXXFLAGS) $(GF_CXXFLAGS)
override HOST_CXXFLAGS := $(MK_HOST_CXXFLAGS) $(HOST_CXXFLAGS) override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS)
override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS) override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)
# save CXXFLAGS before we add host-only options # identify CUDA host compiler
NVCCFLAGS := $(NVCCFLAGS) $(CXXFLAGS) $(CUDA_CXXFLAGS) -Wno-pedantic -Xcompiler "$(HOST_CXXFLAGS)" ifdef LLAMA_CUBLAS
override CXXFLAGS += $(HOST_CXXFLAGS) GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler
include scripts/get-flags.mk
CUDA_CXXFLAGS := $(GF_CXXFLAGS)
endif
# #
# Print build information # Print build information
@ -510,12 +522,24 @@ ggml.o: ggml.c ggml.h ggml-cuda.h
ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
$(CC) $(CFLAGS) -c $< -o $@ $(CC) $(CFLAGS) -c $< -o $@
OBJS += ggml-alloc.o ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
$(CC) $(CFLAGS) -c $< -o $@
llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h
$(CC) $(CFLAGS) -c $< -o $@
OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o
llama.o: llama.cpp ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
$(CXX) $(CXXFLAGS) -c $< -o $@ $(CXX) $(CXXFLAGS) -c $< -o $@
common.o: common/common.cpp common/common.h build-info.h common/log.h COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o
common.o: common/common.cpp $(COMMON_H_DEPS)
$(CXX) $(CXXFLAGS) -c $< -o $@
sampling.o: common/sampling.cpp $(COMMON_H_DEPS)
$(CXX) $(CXXFLAGS) -c $< -o $@ $(CXX) $(CXXFLAGS) -c $< -o $@
console.o: common/console.cpp common/console.h console.o: common/console.cpp common/console.h
@ -531,77 +555,88 @@ libllama.so: llama.o ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
clean: clean:
rm -vrf *.o tests/*.o *.so *.dll benchmark-matmult build-info.h *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS) rm -vrf *.o tests/*.o *.so *.dll benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
# #
# Examples # Examples
# #
main: examples/main/main.cpp build-info.h ggml.o llama.o common.o console.o grammar-parser.o $(OBJS) main: examples/main/main.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
@echo @echo
@echo '==== Run ./main -h for help. ====' @echo '==== Run ./main -h for help. ===='
@echo @echo
simple: examples/simple/simple.cpp build-info.h ggml.o llama.o common.o $(OBJS) infill: examples/infill/infill.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
batched: examples/batched/batched.cpp build-info.h ggml.o llama.o common.o $(OBJS) simple: examples/simple/simple.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o $(OBJS) tokenize: examples/tokenize/tokenize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o llama.o $(OBJS) batched: examples/batched/batched.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o llama.o common.o $(OBJS) batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o common.o $(OBJS) quantize: examples/quantize/quantize.cpp build-info.o ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS) quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.o ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS) perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
$(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o common.o $(OBJS) embedding: examples/embedding/embedding.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) --shared $(CXXFLAGS) $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS) server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %$(DSO_EXT),$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput $(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) -Wno-cast-qual
gguf: examples/gguf/gguf.cpp ggml.o llama.o $(OBJS) gguf: examples/gguf/gguf.cpp ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o common.o train.o $(OBJS) train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS) convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
llama-bench: examples/llama-bench/llama-bench.cpp build-info.h ggml.o llama.o common.o $(OBJS) llama-bench: examples/llama-bench/llama-bench.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o common.o train.o $(OBJS) libllava.a: examples/llava/llava.cpp examples/llava/llava.h examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h common/base64.hpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual
llava-cli: examples/llava/llava-cli.cpp examples/llava/clip.h examples/llava/clip.cpp examples/llava/llava.h examples/llava/llava.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
beam-search: examples/beam-search/beam-search.cpp build-info.h ggml.o llama.o common.o $(OBJS) beam-search: examples/beam-search/beam-search.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
finetune: examples/finetune/finetune.cpp build-info.h ggml.o llama.o common.o train.o $(OBJS) finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
export-lora: examples/export-lora/export-lora.cpp build-info.h ggml.o llama.o common.o $(OBJS) export-lora: examples/export-lora/export-lora.cpp ggml.o common/common.h $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
speculative: examples/speculative/speculative.cpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS) speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
parallel: examples/parallel/parallel.cpp build-info.h ggml.o llama.o common.o $(OBJS) parallel: examples/parallel/parallel.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
ifdef LLAMA_METAL ifdef LLAMA_METAL
@ -609,7 +644,12 @@ metal: examples/metal/metal.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
endif endif
build-info.h: $(wildcard .git/index) scripts/build-info.sh ifeq ($(UNAME_S),Darwin)
swift: examples/batched.swift
(cd examples/batched.swift; make build)
endif
common/build-info.cpp: $(wildcard .git/index) scripts/build-info.sh
@sh scripts/build-info.sh $(CC) > $@.tmp @sh scripts/build-info.sh $(CC) > $@.tmp
@if ! cmp -s $@.tmp $@; then \ @if ! cmp -s $@.tmp $@; then \
mv $@.tmp $@; \ mv $@.tmp $@; \
@ -617,19 +657,22 @@ build-info.h: $(wildcard .git/index) scripts/build-info.sh
rm $@.tmp; \ rm $@.tmp; \
fi fi
build-info.o: common/build-info.cpp
$(CXX) $(CXXFLAGS) -c $(filter-out %.h,$^) -o $@
# #
# Tests # Tests
# #
tests: $(TEST_TARGETS) tests: $(TEST_TARGETS)
benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o $(OBJS) benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.o ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
run-benchmark-matmult: benchmark-matmult run-benchmark-matmult: benchmark-matmult
./$@ ./$@
.PHONY: run-benchmark-matmult .PHONY: run-benchmark-matmult swift
vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS) vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
@ -637,38 +680,47 @@ vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
q8dot: pocs/vdot/q8dot.cpp ggml.o $(OBJS) q8dot: pocs/vdot/q8dot.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
tests/test-llama-grammar: tests/test-llama-grammar.cpp build-info.h ggml.o common.o grammar-parser.o $(OBJS) tests/test-llama-grammar: tests/test-llama-grammar.cpp ggml.o grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-grammar-parser: tests/test-grammar-parser.cpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS) tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-double-float: tests/test-double-float.cpp build-info.h ggml.o llama.o common.o $(OBJS) tests/test-double-float: tests/test-double-float.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-grad0: tests/test-grad0.cpp build-info.h ggml.o llama.o common.o $(OBJS) tests/test-grad0: tests/test-grad0.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-opt: tests/test-opt.cpp build-info.h ggml.o llama.o common.o $(OBJS) tests/test-opt: tests/test-opt.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-quantize-fns: tests/test-quantize-fns.cpp build-info.h ggml.o llama.o common.o $(OBJS) tests/test-quantize-fns: tests/test-quantize-fns.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-quantize-perf: tests/test-quantize-perf.cpp build-info.h ggml.o llama.o common.o $(OBJS) tests/test-quantize-perf: tests/test-quantize-perf.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-sampling: tests/test-sampling.cpp build-info.h ggml.o llama.o common.o $(OBJS) tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp build-info.h ggml.o llama.o common.o $(OBJS) tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp build-info.h ggml.o llama.o common.o $(OBJS) tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp build-info.h ggml.o llama.o common.o $(OBJS) tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-rope: tests/test-rope.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-c.o: tests/test-c.c llama.h tests/test-c.o: tests/test-c.c llama.h
$(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@ $(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@
tests/test-backend-ops: tests/test-backend-ops.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

View file

@ -1,31 +1,15 @@
// swift-tools-version:5.3 // swift-tools-version:5.5
import PackageDescription import PackageDescription
#if arch(arm) || arch(arm64)
let platforms: [SupportedPlatform]? = [
.macOS(.v11),
.iOS(.v14),
.watchOS(.v4),
.tvOS(.v14)
]
let exclude: [String] = []
let additionalSources: [String] = ["ggml-metal.m", "ggml-metal.metal"]
let additionalSettings: [CSetting] = [
.unsafeFlags(["-fno-objc-arc"]),
.define("GGML_SWIFT"),
.define("GGML_USE_METAL")
]
#else
let platforms: [SupportedPlatform]? = nil
let exclude: [String] = ["ggml-metal.metal"]
let additionalSources: [String] = []
let additionalSettings: [CSetting] = []
#endif
let package = Package( let package = Package(
name: "llama", name: "llama",
platforms: platforms, platforms: [
.macOS(.v12),
.iOS(.v14),
.watchOS(.v4),
.tvOS(.v14)
],
products: [ products: [
.library(name: "llama", targets: ["llama"]), .library(name: "llama", targets: ["llama"]),
], ],
@ -33,21 +17,30 @@ let package = Package(
.target( .target(
name: "llama", name: "llama",
path: ".", path: ".",
exclude: exclude, exclude: [],
sources: [ sources: [
"ggml.c", "ggml.c",
"llama.cpp", "llama.cpp",
"ggml-alloc.c", "ggml-alloc.c",
"k_quants.c", "ggml-backend.c",
] + additionalSources, "ggml-quants.c",
"ggml-metal.m",
],
resources: [
.process("ggml-metal.metal")
],
publicHeadersPath: "spm-headers", publicHeadersPath: "spm-headers",
cSettings: [ cSettings: [
.unsafeFlags(["-Wno-shorten-64-to-32"]), .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
.define("GGML_USE_K_QUANTS"),
.define("GGML_USE_ACCELERATE"), .define("GGML_USE_ACCELERATE"),
.define("ACCELERATE_NEW_LAPACK"), .unsafeFlags(["-fno-objc-arc"]),
.define("ACCELERATE_LAPACK_ILP64") .define("GGML_USE_METAL"),
] + additionalSettings, // NOTE: NEW_LAPACK will required iOS version 16.4+
// We should consider add this in the future when we drop support for iOS 14
// (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
// .define("ACCELERATE_NEW_LAPACK"),
// .define("ACCELERATE_LAPACK_ILP64")
],
linkerSettings: [ linkerSettings: [
.linkedFramework("Accelerate") .linkedFramework("Accelerate")
] ]

107
README.md
View file

@ -2,20 +2,19 @@
![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png) ![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)
[![Actions Status](https://github.com/ggerganov/llama.cpp/workflows/CI/badge.svg)](https://github.com/ggerganov/llama.cpp/actions)
[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT) [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
[Roadmap](https://github.com/users/ggerganov/projects/7) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml) [Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
### Hot topics ### Hot topics
- Parallel decoding + continuous batching support incoming: [#3228](https://github.com/ggerganov/llama.cpp/pull/3228) \ - Added Mixtral support: https://github.com/ggerganov/llama.cpp/pull/4406
**Devs should become familiar with the new API** - **llama.h API change for handling KV cache offloading and data type: https://github.com/ggerganov/llama.cpp/pull/4309**
- Local Falcon 180B inference on Mac Studio - Using `llama.cpp` with AWS instances: https://github.com/ggerganov/llama.cpp/discussions/4225
- Looking for contributions to improve and maintain the `server` example: https://github.com/ggerganov/llama.cpp/issues/4216
https://github.com/ggerganov/llama.cpp/assets/1991296/98abd4e8-7077-464c-ae89-aebabca7757e - Collecting Apple Silicon performance stats: https://github.com/ggerganov/llama.cpp/discussions/4167
---- ----
@ -88,17 +87,35 @@ as the main playground for developing new features for the [ggml](https://github
- [X] [Vicuna](https://github.com/ggerganov/llama.cpp/discussions/643#discussioncomment-5533894) - [X] [Vicuna](https://github.com/ggerganov/llama.cpp/discussions/643#discussioncomment-5533894)
- [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/) - [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
- [X] [OpenBuddy 🐶 (Multilingual)](https://github.com/OpenBuddy/OpenBuddy) - [X] [OpenBuddy 🐶 (Multilingual)](https://github.com/OpenBuddy/OpenBuddy)
- [X] [Pygmalion 7B / Metharme 7B](#using-pygmalion-7b--metharme-7b) - [X] [Pygmalion/Metharme](#using-pygmalion-7b--metharme-7b)
- [X] [WizardLM](https://github.com/nlpxucan/WizardLM) - [X] [WizardLM](https://github.com/nlpxucan/WizardLM)
- [X] [Baichuan-7B](https://huggingface.co/baichuan-inc/baichuan-7B) and its derivations (such as [baichuan-7b-sft](https://huggingface.co/hiyouga/baichuan-7b-sft)) - [X] [Baichuan 1 & 2](https://huggingface.co/models?search=baichuan-inc/Baichuan) + [derivations](https://huggingface.co/hiyouga/baichuan-7b-sft)
- [X] [Aquila-7B](https://huggingface.co/BAAI/Aquila-7B) / [AquilaChat-7B](https://huggingface.co/BAAI/AquilaChat-7B) - [X] [Aquila 1 & 2](https://huggingface.co/models?search=BAAI/Aquila)
- [X] Mistral AI v0.1 - [X] [Starcoder models](https://github.com/ggerganov/llama.cpp/pull/3187)
- [X] [Mistral AI v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)
- [X] [Refact](https://huggingface.co/smallcloudai/Refact-1_6B-fim)
- [X] [Persimmon 8B](https://github.com/ggerganov/llama.cpp/pull/3410)
- [X] [MPT](https://github.com/ggerganov/llama.cpp/pull/3417)
- [X] [Bloom](https://github.com/ggerganov/llama.cpp/pull/3553)
- [x] [Yi models](https://huggingface.co/models?search=01-ai/Yi)
- [X] [StableLM-3b-4e1t](https://github.com/ggerganov/llama.cpp/pull/3586)
- [x] [Deepseek models](https://huggingface.co/models?search=deepseek-ai/deepseek)
- [x] [Qwen models](https://huggingface.co/models?search=Qwen/Qwen)
- [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral)
**Multimodal models:**
- [x] [Llava 1.5 models](https://huggingface.co/collections/liuhaotian/llava-15-653aac15d994e992e2677a7e)
- [x] [Bakllava](https://huggingface.co/models?search=SkunkworksAI/Bakllava)
- [x] [Obsidian](https://huggingface.co/NousResearch/Obsidian-3B-V0.5)
- [x] [ShareGPT4V](https://huggingface.co/models?search=Lin-Chen/ShareGPT4V)
**Bindings:** **Bindings:**
- Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python) - Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
- Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp) - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
- Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp), [hlhr202/llama-node](https://github.com/hlhr202/llama-node) - Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp)
- Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb) - Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
- Rust: [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp) - Rust: [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
- C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp) - C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
@ -112,6 +129,8 @@ as the main playground for developing new features for the [ggml](https://github
- [nat/openplayground](https://github.com/nat/openplayground) - [nat/openplayground](https://github.com/nat/openplayground)
- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui) - [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui)
- [withcatai/catai](https://github.com/withcatai/catai) - [withcatai/catai](https://github.com/withcatai/catai)
- [semperai/amica](https://github.com/semperai/amica)
- [psugihara/FreeChat](https://github.com/psugihara/FreeChat)
--- ---
@ -201,7 +220,7 @@ https://user-images.githubusercontent.com/1991296/224442907-7693d4be-acaa-4e01-8
## Usage ## Usage
Here are the steps for the LLaMA-7B model. Here are the end-to-end binary build and model conversion steps for the LLaMA-7B model.
### Get the Code ### Get the Code
@ -274,7 +293,7 @@ In order to build llama.cpp you have three different options.
On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU. On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU.
To disable the Metal build at compile time use the `LLAMA_NO_METAL=1` flag or the `LLAMA_METAL=OFF` cmake option. To disable the Metal build at compile time use the `LLAMA_NO_METAL=1` flag or the `LLAMA_METAL=OFF` cmake option.
When built with Metal support, you can explicitly disable GPU inference with the `--gpu-layers|-ngl 0` command-line When built with Metal support, you can explicitly disable GPU inference with the `--n-gpu-layers|-ngl 0` command-line
argument. argument.
### MPI Build ### MPI Build
@ -318,7 +337,7 @@ mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.gguf -n 128
### BLAS Build ### BLAS Build
Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). BLAS doesn't affect the normal generation performance. There are currently three different implementations of it: Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS and CLBlast. There are currently several different BLAS implementations available for build and use:
- #### Accelerate Framework: - #### Accelerate Framework:
@ -375,7 +394,7 @@ Building the program with BLAS support may lead to some performance improvements
- #### cuBLAS - #### cuBLAS
This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads). This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
- Using `make`: - Using `make`:
```bash ```bash
make LLAMA_CUBLAS=1 make LLAMA_CUBLAS=1
@ -408,22 +427,31 @@ Building the program with BLAS support may lead to some performance improvements
This provides BLAS acceleration on HIP-supported AMD GPUs. This provides BLAS acceleration on HIP-supported AMD GPUs.
Make sure to have ROCm installed. Make sure to have ROCm installed.
You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html). You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html).
Windows support is coming soon...
- Using `make`: - Using `make`:
```bash ```bash
make LLAMA_HIPBLAS=1 make LLAMA_HIPBLAS=1
``` ```
- Using `CMake`: - Using `CMake` for Linux:
```bash ```bash
mkdir build mkdir build
cd build cd build
CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ cmake .. -DLLAMA_HIPBLAS=ON CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ cmake .. -DLLAMA_HIPBLAS=ON
cmake --build . cmake --build .
``` ```
- Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS):
```bash
set PATH=%HIP_PATH%\bin;%PATH%
mkdir build
cd build
cmake -G Ninja -DAMDGPU_TARGETS=gfx1100 -DLLAMA_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ ..
cmake --build .
```
Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors)
The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used. The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
If your GPU is not officialy supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 or 11.0.0 on RDNA3. If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 or 11.0.0 on RDNA3.
The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above): The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above):
| Option | Legal values | Default | Description | | Option | Legal values | Default | Description |
@ -568,6 +596,18 @@ python3 convert.py models/7B/
When running the larger models, make sure you have enough disk space to store all the intermediate files. When running the larger models, make sure you have enough disk space to store all the intermediate files.
### Running on Windows with prebuilt binaries
You will find prebuilt Windows binaries on the release page.
Simply download and extract the latest zip package of choice: (e.g. `llama-b1380-bin-win-avx2-x64.zip`)
From the unzipped folder, open a terminal/cmd window here and place a pre-converted `.gguf` model file. Test out the main example like so:
```
.\main -m llama-2-7b.Q4_0.gguf -n 128
```
### Memory/Disk Requirements ### Memory/Disk Requirements
As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same. As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same.
@ -611,6 +651,18 @@ For more information, see [https://huggingface.co/docs/transformers/perplexity](
The perplexity measurements in table above are done against the `wikitext2` test dataset (https://paperswithcode.com/dataset/wikitext-2), with context length of 512. The perplexity measurements in table above are done against the `wikitext2` test dataset (https://paperswithcode.com/dataset/wikitext-2), with context length of 512.
The time per token is measured on a MacBook M1 Pro 32GB RAM using 4 and 8 threads. The time per token is measured on a MacBook M1 Pro 32GB RAM using 4 and 8 threads.
#### How to run
1. Download/extract: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
2. Run `./perplexity -m models/7B/ggml-model-q4_0.gguf -f wiki.test.raw`
3. Output:
```
perplexity : calculating perplexity over 655 chunks
24.43 seconds per pass - ETA 4.45 hours
[1]4.5970,[2]5.1807,[3]6.0382,...
```
And after 4.45 hours, you will have the final perplexity.
### Interactive mode ### Interactive mode
If you want a more ChatGPT-like experience, you can run in interactive mode by passing `-i` as a parameter. If you want a more ChatGPT-like experience, you can run in interactive mode by passing `-i` as a parameter.
@ -662,6 +714,8 @@ PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \
The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md). The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md).
For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets you write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community, please file any issues or FRs on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one.
### Instruction mode with Alpaca ### Instruction mode with Alpaca
1. First, download the `ggml` Alpaca model into the `./models` folder 1. First, download the `ggml` Alpaca model into the `./models` folder
@ -771,18 +825,6 @@ If your issue is with model generation quality, then please at least scan the fo
- [Aligning language models to follow instructions](https://openai.com/research/instruction-following) - [Aligning language models to follow instructions](https://openai.com/research/instruction-following)
- [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155) - [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155)
#### How to run
1. Download/extract: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
2. Run `./perplexity -m models/7B/ggml-model-q4_0.gguf -f wiki.test.raw`
3. Output:
```
perplexity : calculating perplexity over 655 chunks
24.43 seconds per pass - ETA 4.45 hours
[1]4.5970,[2]5.1807,[3]6.0382,...
```
And after 4.45 hours, you will have the final perplexity.
### Android ### Android
#### Building the Project using Android NDK #### Building the Project using Android NDK
@ -867,7 +909,7 @@ Additionally, there the following images, similar to the above:
- `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) - `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) - `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](.devops/) and the Gitlab Action defined in [.github/workflows/docker.yml](.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library, you'll need to build the images locally for now). The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library, you'll need to build the images locally for now).
#### Usage #### Usage
@ -945,7 +987,6 @@ docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /
- [main](./examples/main/README.md) - [main](./examples/main/README.md)
- [server](./examples/server/README.md) - [server](./examples/server/README.md)
- [embd-input](./examples/embd-input/README.md)
- [jeopardy](./examples/jeopardy/README.md) - [jeopardy](./examples/jeopardy/README.md)
- [BLIS](./docs/BLIS.md) - [BLIS](./docs/BLIS.md)
- [Performance troubleshooting](./docs/token_generation_performance_tips.md) - [Performance troubleshooting](./docs/token_generation_performance_tips.md)

View file

@ -10,7 +10,6 @@ const Maker = struct {
builder: *std.build.Builder, builder: *std.build.Builder,
target: CrossTarget, target: CrossTarget,
optimize: Mode, optimize: Mode,
config_header: *ConfigHeader,
enable_lto: bool, enable_lto: bool,
include_dirs: ArrayList([]const u8), include_dirs: ArrayList([]const u8),
@ -36,28 +35,29 @@ const Maker = struct {
} }
fn init(builder: *std.build.Builder) !Maker { fn init(builder: *std.build.Builder) !Maker {
// const commit_hash = @embedFile(".git/refs/heads/master");
const target = builder.standardTargetOptions(.{}); const target = builder.standardTargetOptions(.{});
const config_header = builder.addConfigHeader( const zig_version = @import("builtin").zig_version_string;
.{ .style = .blank, .include_path = "build-info.h" }, const commit_hash = try std.ChildProcess.exec(
.{ .{ .allocator = builder.allocator, .argv = &.{ "git", "rev-parse", "HEAD" } },
.BUILD_NUMBER = 0,
.BUILD_COMMIT = "12345", // omit newline
.BUILD_COMPILER = "Zig 0.11.0",
.BUILD_TARGET = try target.allocDescription(builder.allocator),
},
); );
try std.fs.cwd().writeFile("common/build-info.cpp", builder.fmt(
\\int LLAMA_BUILD_NUMBER = {};
\\char const *LLAMA_COMMIT = "{s}";
\\char const *LLAMA_COMPILER = "Zig {s}";
\\char const *LLAMA_BUILD_TARGET = "{s}";
\\
, .{ 0, commit_hash.stdout[0 .. commit_hash.stdout.len - 1], zig_version, try target.allocDescription(builder.allocator) }));
var m = Maker{ var m = Maker{
.builder = builder, .builder = builder,
.target = target, .target = target,
.optimize = builder.standardOptimizeOption(.{}), .optimize = builder.standardOptimizeOption(.{}),
.config_header = config_header,
.enable_lto = false, .enable_lto = false,
.include_dirs = ArrayList([]const u8).init(builder.allocator), .include_dirs = ArrayList([]const u8).init(builder.allocator),
.cflags = ArrayList([]const u8).init(builder.allocator), .cflags = ArrayList([]const u8).init(builder.allocator),
.cxxflags = ArrayList([]const u8).init(builder.allocator), .cxxflags = ArrayList([]const u8).init(builder.allocator),
.objs = ArrayList(*Compile).init(builder.allocator), .objs = ArrayList(*Compile).init(builder.allocator),
}; };
try m.addCFlag("-std=c11"); try m.addCFlag("-std=c11");
try m.addCxxFlag("-std=c++11"); try m.addCxxFlag("-std=c++11");
try m.addProjectInclude(&.{}); try m.addProjectInclude(&.{});
@ -67,14 +67,21 @@ const Maker = struct {
fn obj(m: *const Maker, name: []const u8, src: []const u8) *Compile { fn obj(m: *const Maker, name: []const u8, src: []const u8) *Compile {
const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize }); const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize });
if (o.target.getAbi() != .msvc)
o.defineCMacro("_GNU_SOURCE", null);
if (std.mem.endsWith(u8, src, ".c")) { if (std.mem.endsWith(u8, src, ".c")) {
o.addCSourceFiles(&.{src}, m.cflags.items); o.addCSourceFiles(&.{src}, m.cflags.items);
o.linkLibC(); o.linkLibC();
} else { } else {
o.addCSourceFiles(&.{src}, m.cxxflags.items); o.addCSourceFiles(&.{src}, m.cxxflags.items);
o.linkLibCpp(); if (o.target.getAbi() == .msvc) {
o.linkLibC(); // need winsdk + crt
} else {
// linkLibCpp already add (libc++ + libunwind + libc)
o.linkLibCpp();
}
} }
o.addConfigHeader(m.config_header);
for (m.include_dirs.items) |i| o.addIncludePath(.{ .path = i }); for (m.include_dirs.items) |i| o.addIncludePath(.{ .path = i });
o.want_lto = m.enable_lto; o.want_lto = m.enable_lto;
return o; return o;
@ -86,9 +93,14 @@ const Maker = struct {
for (deps) |d| e.addObject(d); for (deps) |d| e.addObject(d);
for (m.objs.items) |o| e.addObject(o); for (m.objs.items) |o| e.addObject(o);
for (m.include_dirs.items) |i| e.addIncludePath(.{ .path = i }); for (m.include_dirs.items) |i| e.addIncludePath(.{ .path = i });
e.linkLibC();
e.linkLibCpp(); // https://github.com/ziglang/zig/issues/15448
e.addConfigHeader(m.config_header); if (e.target.getAbi() == .msvc) {
e.linkLibC(); // need winsdk + crt
} else {
// linkLibCpp already add (libc++ + libunwind + libc)
e.linkLibCpp();
}
m.builder.installArtifact(e); m.builder.installArtifact(e);
e.want_lto = m.enable_lto; e.want_lto = m.enable_lto;
return e; return e;
@ -99,26 +111,27 @@ pub fn build(b: *std.build.Builder) !void {
var make = try Maker.init(b); var make = try Maker.init(b);
make.enable_lto = b.option(bool, "lto", "Enable LTO optimization, (default: false)") orelse false; make.enable_lto = b.option(bool, "lto", "Enable LTO optimization, (default: false)") orelse false;
if (b.option(bool, "k-quants", "Enable K-quants, (default: true)") orelse true) {
try make.addFlag("-DGGML_USE_K_QUANTS");
const k_quants = make.obj("k_quants", "k_quants.c");
try make.objs.append(k_quants);
}
const ggml = make.obj("ggml", "ggml.c"); const ggml = make.obj("ggml", "ggml.c");
const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c"); const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
const ggml_quants = make.obj("ggml-quants", "ggml-quants.c");
const llama = make.obj("llama", "llama.cpp"); const llama = make.obj("llama", "llama.cpp");
const buildinfo = make.obj("common", "common/build-info.cpp");
const common = make.obj("common", "common/common.cpp"); const common = make.obj("common", "common/common.cpp");
const console = make.obj("common", "common/console.cpp"); const console = make.obj("console", "common/console.cpp");
const sampling = make.obj("sampling", "common/sampling.cpp");
const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp"); const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
const train = make.obj("train", "common/train.cpp");
const clip = make.obj("clip", "examples/llava/clip.cpp");
_ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, llama, common, console, grammar_parser }); _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, console, grammar_parser });
_ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, llama, common }); _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
_ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, llama, common }); _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
_ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, llama, common }); _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, llama, common }); _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train });
_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train });
const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, llama, common, grammar_parser }); const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, grammar_parser, clip });
if (server.target.isWindows()) { if (server.target.isWindows()) {
server.linkSystemLibrary("ws2_32"); server.linkSystemLibrary("ws2_32");
} }

View file

@ -208,6 +208,8 @@ function gg_run_open_llama_3b_v2 {
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
function check_ppl { function check_ppl {
qnt="$1" qnt="$1"
ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1) ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
@ -296,6 +298,7 @@ function gg_sum_open_llama_3b_v2 {
gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)" gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)" gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)" gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)" gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)"
gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)" gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"
gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)" gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)"
@ -382,6 +385,8 @@ function gg_run_open_llama_7b_v2 {
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
function check_ppl { function check_ppl {
qnt="$1" qnt="$1"
ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1) ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
@ -470,6 +475,7 @@ function gg_sum_open_llama_7b_v2 {
gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)" gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)" gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)" gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)" gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)"
gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)" gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"
#gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)" #gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)"
@ -496,10 +502,12 @@ test $ret -eq 0 && gg_run ctest_debug
test $ret -eq 0 && gg_run ctest_release test $ret -eq 0 && gg_run ctest_release
if [ -z ${GG_BUILD_LOW_PERF} ]; then if [ -z ${GG_BUILD_LOW_PERF} ]; then
if [ -z ${GG_BUILD_CUDA} ]; then if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
test $ret -eq 0 && gg_run open_llama_3b_v2 if [ -z ${GG_BUILD_CUDA} ]; then
else test $ret -eq 0 && gg_run open_llama_3b_v2
test $ret -eq 0 && gg_run open_llama_7b_v2 else
test $ret -eq 0 && gg_run open_llama_7b_v2
fi
fi fi
fi fi

100
cmake/FindSIMD.cmake Normal file
View file

@ -0,0 +1,100 @@
include(CheckCSourceRuns)
set(AVX_CODE "
#include <immintrin.h>
int main()
{
__m256 a;
a = _mm256_set1_ps(0);
return 0;
}
")
set(AVX512_CODE "
#include <immintrin.h>
int main()
{
__m512i a = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0);
__m512i b = a;
__mmask64 equality_mask = _mm512_cmp_epi8_mask(a, b, _MM_CMPINT_EQ);
return 0;
}
")
set(AVX2_CODE "
#include <immintrin.h>
int main()
{
__m256i a = {0};
a = _mm256_abs_epi16(a);
__m256i x;
_mm256_extract_epi64(x, 0); // we rely on this in our AVX2 code
return 0;
}
")
set(FMA_CODE "
#include <immintrin.h>
int main()
{
__m256 acc = _mm256_setzero_ps();
const __m256 d = _mm256_setzero_ps();
const __m256 p = _mm256_setzero_ps();
acc = _mm256_fmadd_ps( d, p, acc );
return 0;
}
")
macro(check_sse type flags)
set(__FLAG_I 1)
set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
foreach (__FLAG ${flags})
if (NOT ${type}_FOUND)
set(CMAKE_REQUIRED_FLAGS ${__FLAG})
check_c_source_runs("${${type}_CODE}" HAS_${type}_${__FLAG_I})
if (HAS_${type}_${__FLAG_I})
set(${type}_FOUND TRUE CACHE BOOL "${type} support")
set(${type}_FLAGS "${__FLAG}" CACHE STRING "${type} flags")
endif()
math(EXPR __FLAG_I "${__FLAG_I}+1")
endif()
endforeach()
set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
if (NOT ${type}_FOUND)
set(${type}_FOUND FALSE CACHE BOOL "${type} support")
set(${type}_FLAGS "" CACHE STRING "${type} flags")
endif()
mark_as_advanced(${type}_FOUND ${type}_FLAGS)
endmacro()
# flags are for MSVC only!
check_sse("AVX" " ;/arch:AVX")
if (NOT ${AVX_FOUND})
set(LLAMA_AVX OFF)
else()
set(LLAMA_AVX ON)
endif()
check_sse("AVX2" " ;/arch:AVX2")
check_sse("FMA" " ;/arch:AVX2")
if ((NOT ${AVX2_FOUND}) OR (NOT ${FMA_FOUND}))
set(LLAMA_AVX2 OFF)
else()
set(LLAMA_AVX2 ON)
endif()
check_sse("AVX512" " ;/arch:AVX512")
if (NOT ${AVX512_FOUND})
set(LLAMA_AVX512 OFF)
else()
set(LLAMA_AVX512 ON)
endif()

View file

@ -1,10 +1,56 @@
# common # common
# Build info header
#
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
# Is git submodule
if(NOT IS_DIRECTORY "${GIT_DIR}")
file(READ ${GIT_DIR} REAL_GIT_DIR_LINK)
string(REGEX REPLACE "gitdir: (.*)\n$" "\\1" REAL_GIT_DIR ${REAL_GIT_DIR_LINK})
string(FIND "${REAL_GIT_DIR}" "/" SLASH_POS)
if (SLASH_POS EQUAL 0)
set(GIT_DIR "${REAL_GIT_DIR}")
else()
set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../${REAL_GIT_DIR}")
endif()
endif()
set(GIT_INDEX "${GIT_DIR}/index")
else()
message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.")
set(GIT_INDEX "")
endif()
# Add a custom command to rebuild build-info.cpp when .git/index changes
add_custom_command(
OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp"
COMMENT "Generating build details from Git"
COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}
-DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/../scripts/gen-build-info-cpp.cmake"
WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.."
DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX}
VERBATIM
)
set(TARGET build_info)
add_library(${TARGET} OBJECT build-info.cpp)
if (BUILD_SHARED_LIBS)
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()
set(TARGET common) set(TARGET common)
add_library(${TARGET} OBJECT add_library(${TARGET} STATIC
base64.hpp
common.h common.h
common.cpp common.cpp
sampling.h
sampling.cpp
console.h console.h
console.cpp console.cpp
grammar-parser.h grammar-parser.h
@ -19,4 +65,4 @@ endif()
target_include_directories(${TARGET} PUBLIC .) target_include_directories(${TARGET} PUBLIC .)
target_compile_features(${TARGET} PUBLIC cxx_std_11) target_compile_features(${TARGET} PUBLIC cxx_std_11)
target_link_libraries(${TARGET} PRIVATE llama) target_link_libraries(${TARGET} PRIVATE llama build_info)

392
common/base64.hpp Normal file
View file

@ -0,0 +1,392 @@
/*
This is free and unencumbered software released into the public domain.
Anyone is free to copy, modify, publish, use, compile, sell, or
distribute this software, either in source code form or as a compiled
binary, for any purpose, commercial or non-commercial, and by any
means.
In jurisdictions that recognize copyright laws, the author or authors
of this software dedicate any and all copyright interest in the
software to the public domain. We make this dedication for the benefit
of the public at large and to the detriment of our heirs and
successors. We intend this dedication to be an overt act of
relinquishment in perpetuity of all present and future rights to this
software under copyright law.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
For more information, please refer to <http://unlicense.org>
*/
#ifndef PUBLIC_DOMAIN_BASE64_HPP_
#define PUBLIC_DOMAIN_BASE64_HPP_
#include <cstdint>
#include <iterator>
#include <stdexcept>
#include <string>
class base64_error : public std::runtime_error
{
public:
using std::runtime_error::runtime_error;
};
class base64
{
public:
enum class alphabet
{
/** the alphabet is detected automatically */
auto_,
/** the standard base64 alphabet is used */
standard,
/** like `standard` except that the characters `+` and `/` are replaced by `-` and `_` respectively*/
url_filename_safe
};
enum class decoding_behavior
{
/** if the input is not padded, the remaining bits are ignored */
moderate,
/** if a padding character is encounter decoding is finished */
loose
};
/**
Encodes all the elements from `in_begin` to `in_end` to `out`.
@warning The source and destination cannot overlap. The destination must be able to hold at least
`required_encode_size(std::distance(in_begin, in_end))`, otherwise the behavior depends on the output iterator.
@tparam Input_iterator the source; the returned elements are cast to `std::uint8_t` and should not be greater than
8 bits
@tparam Output_iterator the destination; the elements written to it are from the type `char`
@param in_begin the beginning of the source
@param in_end the ending of the source
@param out the destination iterator
@param alphabet which alphabet should be used
@returns the iterator to the next element past the last element copied
@throws see `Input_iterator` and `Output_iterator`
*/
template<typename Input_iterator, typename Output_iterator>
static Output_iterator encode(Input_iterator in_begin, Input_iterator in_end, Output_iterator out,
alphabet alphabet = alphabet::standard)
{
constexpr auto pad = '=';
const char* alpha = alphabet == alphabet::url_filename_safe
? "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"
: "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
while (in_begin != in_end) {
std::uint8_t i0 = 0, i1 = 0, i2 = 0;
// first character
i0 = static_cast<std::uint8_t>(*in_begin);
++in_begin;
*out = alpha[i0 >> 2 & 0x3f];
++out;
// part of first character and second
if (in_begin != in_end) {
i1 = static_cast<std::uint8_t>(*in_begin);
++in_begin;
*out = alpha[((i0 & 0x3) << 4) | (i1 >> 4 & 0x0f)];
++out;
} else {
*out = alpha[(i0 & 0x3) << 4];
++out;
// last padding
*out = pad;
++out;
// last padding
*out = pad;
++out;
break;
}
// part of second character and third
if (in_begin != in_end) {
i2 = static_cast<std::uint8_t>(*in_begin);
++in_begin;
*out = alpha[((i1 & 0xf) << 2) | (i2 >> 6 & 0x03)];
++out;
} else {
*out = alpha[(i1 & 0xf) << 2];
++out;
// last padding
*out = pad;
++out;
break;
}
// rest of third
*out = alpha[i2 & 0x3f];
++out;
}
return out;
}
/**
Encodes a string.
@param str the string that should be encoded
@param alphabet which alphabet should be used
@returns the encoded base64 string
@throws see base64::encode()
*/
static std::string encode(const std::string& str, alphabet alphabet = alphabet::standard)
{
std::string result;
result.reserve(required_encode_size(str.length()) + 1);
encode(str.begin(), str.end(), std::back_inserter(result), alphabet);
return result;
}
/**
Encodes a char array.
@param buffer the char array
@param size the size of the array
@param alphabet which alphabet should be used
@returns the encoded string
*/
static std::string encode(const char* buffer, std::size_t size, alphabet alphabet = alphabet::standard)
{
std::string result;
result.reserve(required_encode_size(size) + 1);
encode(buffer, buffer + size, std::back_inserter(result), alphabet);
return result;
}
/**
Decodes all the elements from `in_begin` to `in_end` to `out`. `in_begin` may point to the same location as `out`,
in other words: inplace decoding is possible.
@warning The destination must be able to hold at least `required_decode_size(std::distance(in_begin, in_end))`,
otherwise the behavior depends on the output iterator.
@tparam Input_iterator the source; the returned elements are cast to `char`
@tparam Output_iterator the destination; the elements written to it are from the type `std::uint8_t`
@param in_begin the beginning of the source
@param in_end the ending of the source
@param out the destination iterator
@param alphabet which alphabet should be used
@param behavior the behavior when an error was detected
@returns the iterator to the next element past the last element copied
@throws base64_error depending on the set behavior
@throws see `Input_iterator` and `Output_iterator`
*/
template<typename Input_iterator, typename Output_iterator>
static Output_iterator decode(Input_iterator in_begin, Input_iterator in_end, Output_iterator out,
alphabet alphabet = alphabet::auto_,
decoding_behavior behavior = decoding_behavior::moderate)
{
//constexpr auto pad = '=';
std::uint8_t last = 0;
auto bits = 0;
while (in_begin != in_end) {
auto c = *in_begin;
++in_begin;
if (c == '=') {
break;
}
auto part = _base64_value(alphabet, c);
// enough bits for one byte
if (bits + 6 >= 8) {
*out = (last << (8 - bits)) | (part >> (bits - 2));
++out;
bits -= 2;
} else {
bits += 6;
}
last = part;
}
// check padding
if (behavior != decoding_behavior::loose) {
while (in_begin != in_end) {
auto c = *in_begin;
++in_begin;
if (c != '=') {
throw base64_error("invalid base64 character.");
}
}
}
return out;
}
/**
Decodes a string.
@param str the base64 encoded string
@param alphabet which alphabet should be used
@param behavior the behavior when an error was detected
@returns the decoded string
@throws see base64::decode()
*/
static std::string decode(const std::string& str, alphabet alphabet = alphabet::auto_,
decoding_behavior behavior = decoding_behavior::moderate)
{
std::string result;
result.reserve(max_decode_size(str.length()));
decode(str.begin(), str.end(), std::back_inserter(result), alphabet, behavior);
return result;
}
/**
Decodes a string.
@param buffer the base64 encoded buffer
@param size the size of the buffer
@param alphabet which alphabet should be used
@param behavior the behavior when an error was detected
@returns the decoded string
@throws see base64::decode()
*/
static std::string decode(const char* buffer, std::size_t size, alphabet alphabet = alphabet::auto_,
decoding_behavior behavior = decoding_behavior::moderate)
{
std::string result;
result.reserve(max_decode_size(size));
decode(buffer, buffer + size, std::back_inserter(result), alphabet, behavior);
return result;
}
/**
Decodes a string inplace.
@param[in,out] str the base64 encoded string
@param alphabet which alphabet should be used
@param behavior the behavior when an error was detected
@throws base64::decode_inplace()
*/
static void decode_inplace(std::string& str, alphabet alphabet = alphabet::auto_,
decoding_behavior behavior = decoding_behavior::moderate)
{
str.resize(decode(str.begin(), str.end(), str.begin(), alphabet, behavior) - str.begin());
}
/**
Decodes a char array inplace.
@param[in,out] str the string array
@param size the length of the array
@param alphabet which alphabet should be used
@param behavior the behavior when an error was detected
@returns the pointer to the next element past the last element decoded
@throws base64::decode_inplace()
*/
static char* decode_inplace(char* str, std::size_t size, alphabet alphabet = alphabet::auto_,
decoding_behavior behavior = decoding_behavior::moderate)
{
return decode(str, str + size, str, alphabet, behavior);
}
/**
Returns the required decoding size for a given size. The value is calculated with the following formula:
$$
\lceil \frac{size}{4} \rceil \cdot 3
$$
@param size the size of the encoded input
@returns the size of the resulting decoded buffer; this the absolute maximum
*/
static std::size_t max_decode_size(std::size_t size) noexcept
{
return (size / 4 + (size % 4 ? 1 : 0)) * 3;
}
/**
Returns the required encoding size for a given size. The value is calculated with the following formula:
$$
\lceil \frac{size}{3} \rceil \cdot 4
$$
@param size the size of the decoded input
@returns the size of the resulting encoded buffer
*/
static std::size_t required_encode_size(std::size_t size) noexcept
{
return (size / 3 + (size % 3 ? 1 : 0)) * 4;
}
private:
static std::uint8_t _base64_value(alphabet& alphabet, char c)
{
if (c >= 'A' && c <= 'Z') {
return c - 'A';
} else if (c >= 'a' && c <= 'z') {
return c - 'a' + 26;
} else if (c >= '0' && c <= '9') {
return c - '0' + 52;
}
// comes down to alphabet
if (alphabet == alphabet::standard) {
if (c == '+') {
return 62;
} else if (c == '/') {
return 63;
}
} else if (alphabet == alphabet::url_filename_safe) {
if (c == '-') {
return 62;
} else if (c == '_') {
return 63;
}
} // auto detect
else {
if (c == '+') {
alphabet = alphabet::standard;
return 62;
} else if (c == '/') {
alphabet = alphabet::standard;
return 63;
} else if (c == '-') {
alphabet = alphabet::url_filename_safe;
return 62;
} else if (c == '_') {
alphabet = alphabet::url_filename_safe;
return 63;
}
}
throw base64_error("invalid base64 character.");
}
};
#endif // !PUBLIC_DOMAIN_BASE64_HPP_

4
common/build-info.cpp.in Normal file
View file

@ -0,0 +1,4 @@
int LLAMA_BUILD_NUMBER = @BUILD_NUMBER@;
char const *LLAMA_COMMIT = "@BUILD_COMMIT@";
char const *LLAMA_COMPILER = "@BUILD_COMPILER@";
char const *LLAMA_BUILD_TARGET = "@BUILD_TARGET@";

File diff suppressed because it is too large Load diff

View file

@ -4,9 +4,12 @@
#include "llama.h" #include "llama.h"
#include "sampling.h"
#define LOG_NO_FILE_LINE_FUNCTION #define LOG_NO_FILE_LINE_FUNCTION
#include "log.h" #include "log.h"
#include <cmath>
#include <string> #include <string>
#include <vector> #include <vector>
#include <random> #include <random>
@ -23,69 +26,69 @@
#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0) #define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0) #define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
#define print_build_info() do { \ #define print_build_info() do { \
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); \ fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
fprintf(stderr, "%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET); \ fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
} while(0) } while(0)
// build info
extern int LLAMA_BUILD_NUMBER;
extern char const *LLAMA_COMMIT;
extern char const *LLAMA_COMPILER;
extern char const *LLAMA_BUILD_TARGET;
// //
// CLI argument parsing // CLI argument parsing
// //
int32_t get_num_physical_cores(); int32_t get_num_physical_cores();
struct gpt_params { struct gpt_params {
uint32_t seed = -1; // RNG seed uint32_t seed = -1; // RNG seed
int32_t n_threads = get_num_physical_cores(); int32_t n_threads = get_num_physical_cores();
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads) int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
int32_t n_predict = -1; // new tokens to predict int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 512; // context size int32_t n_ctx = 512; // context size
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS) int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_keep = 0; // number of tokens to keep from initial prompt int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_draft = 16; // number of tokens to draft during speculative decoding int32_t n_draft = 16; // number of tokens to draft during speculative decoding
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited) int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
int32_t n_parallel = 1; // number of parallel sequences to decode int32_t n_parallel = 1; // number of parallel sequences to decode
int32_t n_sequences = 1; // number of sequences to decode int32_t n_sequences = 1; // number of sequences to decode
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) float p_accept = 0.5f; // speculative decoding accept probability
int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default) float p_split = 0.1f; // speculative decoding split probability
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
int32_t n_beams = 0; // if non-zero then use beam search of given width. float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
float rope_freq_base = 0.0f; // RoPE base frequency int32_t n_beams = 0; // if non-zero then use beam search of given width.
float rope_freq_scale = 0.0f; // RoPE frequency scaling factor float rope_freq_base = 0.0f; // RoPE base frequency
float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor
float yarn_beta_fast = 32.0f; // YaRN low correction dim
float yarn_beta_slow = 1.0f; // YaRN high correction dim
int32_t yarn_orig_ctx = 0; // YaRN original context length
int8_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED; // TODO: better to be int32_t for alignment
// pinging @cebtenzzre
// sampling parameters // // sampling parameters
int32_t top_k = 40; // <= 0 to use vocab size struct llama_sampling_params sparams;
float top_p = 0.95f; // 1.0 = disabled
float tfs_z = 1.00f; // 1.0 = disabled
float typical_p = 1.00f; // 1.0 = disabled
float temp = 0.80f; // 1.0 = disabled
float repeat_penalty = 1.10f; // 1.0 = disabled
int32_t repeat_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
float frequency_penalty = 0.00f; // 0.0 = disabled
float presence_penalty = 0.00f; // 0.0 = disabled
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
float mirostat_tau = 5.00f; // target entropy
float mirostat_eta = 0.10f; // learning rate
std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
// Classifier-Free Guidance
// https://arxiv.org/abs/2306.17806
std::string cfg_negative_prompt; // string to help guidance
float cfg_scale = 1.f; // How strong is guidance
std::string model = "models/7B/ggml-model-f16.gguf"; // model path std::string model = "models/7B/ggml-model-f16.gguf"; // model path
std::string model_draft = ""; // draft model for speculative decoding std::string model_draft = ""; // draft model for speculative decoding
std::string model_alias = "unknown"; // model alias std::string model_alias = "unknown"; // model alias
std::string prompt = ""; std::string prompt = "";
std::string prompt_file = ""; // store the external prompt file name
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
std::string input_prefix = ""; // string to prefix user inputs with std::string input_prefix = ""; // string to prefix user inputs with
std::string input_suffix = ""; // string to suffix user inputs with std::string input_suffix = ""; // string to suffix user inputs with
std::string grammar = ""; // optional BNF-like grammar to constrain sampling
std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
std::string logdir = ""; // directory in which to save YAML log files std::string logdir = ""; // directory in which to save YAML log files
std::vector<llama_model_kv_override> kv_overrides;
// TODO: avoid tuple, use struct
std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
std::string lora_base = ""; // base model path for the lora adapter std::string lora_base = ""; // base model path for the lora adapter
@ -93,14 +96,14 @@ struct gpt_params {
int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
// (which is more convenient to use for plotting) // (which is more convenient to use for plotting)
// //
bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS
bool memory_f16 = true; // use f16 instead of f32 for memory kv
bool random_prompt = false; // do not randomize prompt if none provided bool random_prompt = false; // do not randomize prompt if none provided
bool use_color = false; // use color to distinguish generations and inputs bool use_color = false; // use color to distinguish generations and inputs
bool interactive = false; // interactive mode bool interactive = false; // interactive mode
bool chatml = false; // chatml mode (used for models trained on chatml syntax)
bool prompt_cache_all = false; // save user input and generations to prompt cache bool prompt_cache_all = false; // save user input and generations to prompt cache
bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
@ -114,14 +117,25 @@ struct gpt_params {
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
bool ignore_eos = false; // ignore generated EOS tokens bool ignore_eos = false; // ignore generated EOS tokens
bool instruct = false; // instruction mode (used for Alpaca models) bool instruct = false; // instruction mode (used for Alpaca models)
bool penalize_nl = true; // consider newlines as a repeatable token
bool logits_all = false; // return logits for all tokens in the batch bool logits_all = false; // return logits for all tokens in the batch
bool use_mmap = true; // use mmap for faster loads bool use_mmap = true; // use mmap for faster loads
bool use_mlock = false; // use mlock to keep model in memory bool use_mlock = false; // use mlock to keep model in memory
bool numa = false; // attempt optimizations that help on some NUMA systems bool numa = false; // attempt optimizations that help on some NUMA systems
bool verbose_prompt = false; // print prompt tokens before generation bool verbose_prompt = false; // print prompt tokens before generation
bool infill = false; // use infill mode
bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
bool no_kv_offload = false; // disable KV offloading
std::string cache_type_k = "f16"; // KV cache data type for the K
std::string cache_type_v = "f16"; // KV cache data type for the V
// multimodal models (see examples/llava)
std::string mmproj = ""; // path to multimodal projector
std::string image = ""; // path to an image file
}; };
bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);
bool gpt_params_parse(int argc, char ** argv, gpt_params & params); bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
void gpt_print_usage(int argc, char ** argv, const gpt_params & params); void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
@ -132,14 +146,33 @@ std::string gpt_random_prompt(std::mt19937 & rng);
void process_escapes(std::string& input); void process_escapes(std::string& input);
//
// String parsing
//
std::string parse_samplers_input(std::string input);
// //
// Model utils // Model utils
// //
// TODO: avoid tuplue, use struct
std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params); std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params);
struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params);
struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params); struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
// Batch utils
void llama_batch_clear(struct llama_batch & batch);
void llama_batch_add(
struct llama_batch & batch,
llama_token id,
llama_pos pos,
const std::vector<llama_seq_id> & seq_ids,
bool logits);
// //
// Vocab utils // Vocab utils
// //
@ -149,12 +182,14 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
std::vector<llama_token> llama_tokenize( std::vector<llama_token> llama_tokenize(
const struct llama_context * ctx, const struct llama_context * ctx,
const std::string & text, const std::string & text,
bool add_bos); bool add_bos,
bool special = false);
std::vector<llama_token> llama_tokenize( std::vector<llama_token> llama_tokenize(
const struct llama_model * model, const struct llama_model * model,
const std::string & text, const std::string & text,
bool add_bos); bool add_bos,
bool special = false);
// tokenizes a token into a piece // tokenizes a token into a piece
// should work similar to Python's `tokenizer.id_to_piece` // should work similar to Python's `tokenizer.id_to_piece`
@ -178,35 +213,9 @@ std::string llama_detokenize_bpe(
llama_context * ctx, llama_context * ctx,
const std::vector<llama_token> & tokens); const std::vector<llama_token> & tokens);
// // Uses the value from the model metadata if possible, otherwise
// Sampling utils // defaults to true when model type is SPM, otherwise false.
// bool llama_should_add_bos_token(const llama_model * model);
// this is a common sampling function used across the examples for convenience
// it can serve as a starting point for implementing your own sampling function
//
// required:
// - ctx: context to use for sampling
// - params: sampling parameters
//
// optional:
// - ctx_guidance: context to use for classifier-free guidance, ignore if NULL
// - grammar: grammar to use for sampling, ignore if NULL
// - last_tokens: needed for repetition penalty, ignore if empty
// - idx: sample from llama_get_logits_ith(ctx, idx)
//
// returns:
// - token: sampled token
// - candidates: vector of candidate tokens
//
llama_token llama_sample_token(
struct llama_context * ctx,
struct llama_context * ctx_guidance,
struct llama_grammar * grammar,
const struct gpt_params & params,
const std::vector<llama_token> & last_tokens,
std::vector<llama_token_data> & candidates,
int idx = 0);
// //
// YAML utils // YAML utils
@ -221,3 +230,13 @@ std::string get_sortable_timestamp();
void dump_non_result_info_yaml( void dump_non_result_info_yaml(
FILE * stream, const gpt_params & params, const llama_context * lctx, FILE * stream, const gpt_params & params, const llama_context * lctx,
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc); const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
//
// KV cache utils
//
// Dump the KV cache view with the number of sequences per cell.
void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80);
// Dump the KV cache view showing individual sequences in each cell (long output).
void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40);

View file

@ -190,7 +190,7 @@ namespace grammar_parser {
pos = parse_space(pos + 1, is_nested); pos = parse_space(pos + 1, is_nested);
} else if (*pos == '*' || *pos == '+' || *pos == '?') { // repetition operator } else if (*pos == '*' || *pos == '+' || *pos == '?') { // repetition operator
if (last_sym_start == out_elements.size()) { if (last_sym_start == out_elements.size()) {
throw std::runtime_error(std::string("expecting preceeding item to */+/? at ") + pos); throw std::runtime_error(std::string("expecting preceding item to */+/? at ") + pos);
} }
// apply transformation to previous symbol (last_sym_start to end) according to // apply transformation to previous symbol (last_sym_start to end) according to
@ -399,7 +399,7 @@ namespace grammar_parser {
void print_grammar(FILE * file, const parse_state & state) { void print_grammar(FILE * file, const parse_state & state) {
try { try {
std::map<uint32_t, std::string> symbol_id_names; std::map<uint32_t, std::string> symbol_id_names;
for (auto kv : state.symbol_ids) { for (const auto & kv : state.symbol_ids) {
symbol_id_names[kv.second] = kv.first; symbol_id_names[kv.second] = kv.first;
} }
for (size_t i = 0, end = state.rules.size(); i < end; i++) { for (size_t i = 0, end = state.rules.size(); i < end; i++) {

View file

@ -61,13 +61,13 @@
// #define LOG_TARGET stderr // #define LOG_TARGET stderr
// #include "log.h" // #include "log.h"
// //
// The log target can also be redirected to a diffrent function // The log target can also be redirected to a different function
// like so: // like so:
// //
// #define LOG_TARGET log_handler_diffrent() // #define LOG_TARGET log_handler_different()
// #include "log.h" // #include "log.h"
// //
// FILE* log_handler_diffrent() // FILE* log_handler_different()
// { // {
// return stderr; // return stderr;
// } // }
@ -97,37 +97,56 @@
#define LOG_TEE_TARGET stderr #define LOG_TEE_TARGET stderr
#endif #endif
// Utility for synchronizing log configuration state
// since std::optional was introduced only in c++17
enum LogTriState
{
LogTriStateSame,
LogTriStateFalse,
LogTriStateTrue
};
// Utility to obtain "pid" like unique process id and use it when creating log files. // Utility to obtain "pid" like unique process id and use it when creating log files.
inline std::string log_get_pid() inline std::string log_get_pid()
{ {
static std::string pid; static std::string pid;
if (pid.empty()) if (pid.empty())
{ {
// std::this_thread::get_id() is the most portable way of obtaining a "process id" // std::this_thread::get_id() is the most portable way of obtaining a "process id"
// it's not the same as "pid" but is unique enough to solve multiple instances // it's not the same as "pid" but is unique enough to solve multiple instances
// trying to write to the same log. // trying to write to the same log.
std::stringstream ss; std::stringstream ss;
ss << std::this_thread::get_id(); ss << std::this_thread::get_id();
pid = ss.str(); pid = ss.str();
} }
return pid; return pid;
} }
// Utility function for generating log file names with unique id based on thread id. // Utility function for generating log file names with unique id based on thread id.
// invocation with log_filename_generator( "llama", "log" ) creates a string "llama.<number>.log" // invocation with log_filename_generator( "llama", "log" ) creates a string "llama.<number>.log"
// where the number is a runtime id of the current thread. // where the number is a runtime id of the current thread.
#define log_filename_generator(log_file_basename, log_file_extension) log_filename_generator_impl(log_file_basename, log_file_extension) #define log_filename_generator(log_file_basename, log_file_extension) log_filename_generator_impl(LogTriStateSame, log_file_basename, log_file_extension)
// INTERNAL, DO NOT USE // INTERNAL, DO NOT USE
inline std::string log_filename_generator_impl(const std::string & log_file_basename, const std::string & log_file_extension) inline std::string log_filename_generator_impl(LogTriState multilog, const std::string & log_file_basename, const std::string & log_file_extension)
{ {
static bool _multilog = false;
if (multilog != LogTriStateSame)
{
_multilog = multilog == LogTriStateTrue;
}
std::stringstream buf; std::stringstream buf;
buf << log_file_basename; buf << log_file_basename;
buf << "."; if (_multilog)
buf << log_get_pid(); {
buf << ".";
buf << log_get_pid();
}
buf << "."; buf << ".";
buf << log_file_extension; buf << log_file_extension;
@ -212,15 +231,6 @@ inline std::string log_filename_generator_impl(const std::string & log_file_base
#define LOG_TEE_FLF_VAL ,"" #define LOG_TEE_FLF_VAL ,""
#endif #endif
// Utility for synchronizing log configuration state
// since std::optional was introduced only in c++17
enum LogTriState
{
LogTriStateSame,
LogTriStateFalse,
LogTriStateTrue
};
// INTERNAL, DO NOT USE // INTERNAL, DO NOT USE
// USE LOG() INSTEAD // USE LOG() INSTEAD
// //
@ -314,16 +324,23 @@ enum LogTriState
#endif #endif
// INTERNAL, DO NOT USE // INTERNAL, DO NOT USE
inline FILE *log_handler1_impl(bool change = false, LogTriState disable = LogTriStateSame, const std::string & filename = LOG_DEFAULT_FILE_NAME, FILE *target = nullptr) inline FILE *log_handler1_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, const std::string & filename = LOG_DEFAULT_FILE_NAME, FILE *target = nullptr)
{ {
static bool _initialized{false}; static bool _initialized = false;
static bool _disabled{(filename.empty() && target == nullptr)}; static bool _append = false;
static bool _disabled = filename.empty() && target == nullptr;
static std::string log_current_filename{filename}; static std::string log_current_filename{filename};
static FILE *log_current_target{target}; static FILE *log_current_target{target};
static FILE *logfile = nullptr; static FILE *logfile = nullptr;
if (change) if (change)
{ {
if (append != LogTriStateSame)
{
_append = append == LogTriStateTrue;
return logfile;
}
if (disable == LogTriStateTrue) if (disable == LogTriStateTrue)
{ {
// Disable primary target // Disable primary target
@ -376,7 +393,7 @@ inline FILE *log_handler1_impl(bool change = false, LogTriState disable = LogTri
} }
} }
logfile = fopen(filename.c_str(), "w"); logfile = fopen(filename.c_str(), _append ? "a" : "w");
} }
if (!logfile) if (!logfile)
@ -397,20 +414,20 @@ inline FILE *log_handler1_impl(bool change = false, LogTriState disable = LogTri
} }
// INTERNAL, DO NOT USE // INTERNAL, DO NOT USE
inline FILE *log_handler2_impl(bool change = false, LogTriState disable = LogTriStateSame, FILE *target = nullptr, const std::string & filename = LOG_DEFAULT_FILE_NAME) inline FILE *log_handler2_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, FILE *target = nullptr, const std::string & filename = LOG_DEFAULT_FILE_NAME)
{ {
return log_handler1_impl(change, disable, filename, target); return log_handler1_impl(change, append, disable, filename, target);
} }
// Disables logs entirely at runtime. // Disables logs entirely at runtime.
// Makes LOG() and LOG_TEE() produce no output, // Makes LOG() and LOG_TEE() produce no output,
// untill enabled back. // until enabled back.
#define log_disable() log_disable_impl() #define log_disable() log_disable_impl()
// INTERNAL, DO NOT USE // INTERNAL, DO NOT USE
inline FILE *log_disable_impl() inline FILE *log_disable_impl()
{ {
return log_handler1_impl(true, LogTriStateTrue); return log_handler1_impl(true, LogTriStateSame, LogTriStateTrue);
} }
// Enables logs at runtime. // Enables logs at runtime.
@ -419,19 +436,31 @@ inline FILE *log_disable_impl()
// INTERNAL, DO NOT USE // INTERNAL, DO NOT USE
inline FILE *log_enable_impl() inline FILE *log_enable_impl()
{ {
return log_handler1_impl(true, LogTriStateFalse); return log_handler1_impl(true, LogTriStateSame, LogTriStateFalse);
} }
// Sets target fir logs, either by a file name or FILE* pointer (stdout, stderr, or any valid FILE*) // Sets target fir logs, either by a file name or FILE* pointer (stdout, stderr, or any valid FILE*)
#define log_set_target(target) log_set_target_impl(target) #define log_set_target(target) log_set_target_impl(target)
// INTERNAL, DO NOT USE // INTERNAL, DO NOT USE
inline FILE *log_set_target_impl(const std::string & filename) { return log_handler1_impl(true, LogTriStateSame, filename); } inline FILE *log_set_target_impl(const std::string & filename) { return log_handler1_impl(true, LogTriStateSame, LogTriStateSame, filename); }
inline FILE *log_set_target_impl(FILE *target) { return log_handler2_impl(true, LogTriStateSame, target); } inline FILE *log_set_target_impl(FILE *target) { return log_handler2_impl(true, LogTriStateSame, LogTriStateSame, target); }
// INTERNAL, DO NOT USE // INTERNAL, DO NOT USE
inline FILE *log_handler() { return log_handler1_impl(); } inline FILE *log_handler() { return log_handler1_impl(); }
// Enable or disable creating separate log files for each run.
// can ONLY be invoked BEFORE first log use.
#define log_multilog(enable) log_filename_generator_impl((enable) ? LogTriStateTrue : LogTriStateFalse, "", "")
// Enable or disable append mode for log file.
// can ONLY be invoked BEFORE first log use.
#define log_append(enable) log_append_impl(enable)
// INTERNAL, DO NOT USE
inline FILE *log_append_impl(bool enable)
{
return log_handler1_impl(true, enable ? LogTriStateTrue : LogTriStateFalse, LogTriStateSame);
}
inline void log_test() inline void log_test()
{ {
log_disable(); log_disable();
@ -493,6 +522,18 @@ inline bool log_param_single_parse(const std::string & param)
return true; return true;
} }
if (param == "--log-new")
{
log_multilog(true);
return true;
}
if (param == "--log-append")
{
log_append(true);
return true;
}
return false; return false;
} }
@ -522,7 +563,9 @@ inline void log_print_usage()
printf(" --log-disable Disable trace logs\n"); printf(" --log-disable Disable trace logs\n");
printf(" --log-enable Enable trace logs\n"); printf(" --log-enable Enable trace logs\n");
printf(" --log-file Specify a log filename (without extension)\n"); printf(" --log-file Specify a log filename (without extension)\n");
printf(" Log file will be tagged with unique ID and written as \"<name>.<ID>.log\"\n"); /* */ printf(" --log-new Create a separate new log file on start. "
"Each log file will have unique name: \"<name>.<ID>.log\"\n");
printf(" --log-append Don't truncate the old log file.\n");
} }
#define log_dump_cmdline(argc, argv) log_dump_cmdline_impl(argc, argv) #define log_dump_cmdline(argc, argv) log_dump_cmdline_impl(argc, argv)
@ -579,38 +622,75 @@ inline std::string log_var_to_string_impl(const std::vector<int> & var)
return buf.str(); return buf.str();
} }
#define LOG_TOKENS_TOSTR_PRETTY(ctx, tokens) \ template <typename C, typename T>
[&tokens, &ctx]() \ inline std::string LOG_TOKENS_TOSTR_PRETTY(const C & ctx, const T & tokens)
{ \ {
std::stringstream buf; \ std::stringstream buf;
buf << "[ "; \ buf << "[ ";
\
bool first = true; \ bool first = true;
for (const auto &token : tokens) \ for (const auto &token : tokens)
{ \ {
if (!first) \ if (!first) {
buf << ", "; \ buf << ", ";
else \ } else {
first = false; \ first = false;
\ }
auto detokenized = llama_token_to_piece(ctx, token); \
\ auto detokenized = llama_token_to_piece(ctx, token);
detokenized.erase( \
std::remove_if( \ detokenized.erase(
detokenized.begin(), \ std::remove_if(
detokenized.end(), \ detokenized.begin(),
[](const unsigned char c) { return !std::isprint(c); }), \ detokenized.end(),
detokenized.end()); \ [](const unsigned char c) { return !std::isprint(c); }),
\ detokenized.end());
buf \
<< "'" << detokenized << "'" \ buf
<< ":" << std::to_string(token); \ << "'" << detokenized << "'"
} \ << ":" << std::to_string(token);
buf << " ]"; \ }
\ buf << " ]";
return buf.str(); \
}() \ return buf.str();
.c_str() }
template <typename C, typename B>
inline std::string LOG_BATCH_TOSTR_PRETTY(const C & ctx, const B & batch)
{
std::stringstream buf;
buf << "[ ";
bool first = true;
for (int i = 0; i < batch.n_tokens; ++i)
{
if (!first) {
buf << ", ";
} else {
first = false;
}
auto detokenized = llama_token_to_piece(ctx, batch.token[i]);
detokenized.erase(
std::remove_if(
detokenized.begin(),
detokenized.end(),
[](const unsigned char c) { return !std::isprint(c); }),
detokenized.end());
buf
<< "\n" << std::to_string(i)
<< ":token '" << detokenized << "'"
<< ":pos " << std::to_string(batch.pos[i])
<< ":n_seq_id " << std::to_string(batch.n_seq_id[i])
<< ":seq_id " << std::to_string(batch.seq_id[i][0])
<< ":logits " << std::to_string(batch.logits[i]);
}
buf << " ]";
return buf.str();
}
#ifdef LOG_DISABLE_LOGS #ifdef LOG_DISABLE_LOGS

269
common/sampling.cpp Normal file
View file

@ -0,0 +1,269 @@
#include "sampling.h"
struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params) {
struct llama_sampling_context * result = new llama_sampling_context();
result->params = params;
result->grammar = nullptr;
// if there is a grammar, parse it
if (!params.grammar.empty()) {
result->parsed_grammar = grammar_parser::parse(params.grammar.c_str());
// will be empty (default) if there are parse errors
if (result->parsed_grammar.rules.empty()) {
fprintf(stderr, "%s: failed to parse grammar\n", __func__);
return nullptr;
}
std::vector<const llama_grammar_element *> grammar_rules(result->parsed_grammar.c_rules());
result->grammar = llama_grammar_init(
grammar_rules.data(),
grammar_rules.size(), result->parsed_grammar.symbol_ids.at("root"));
}
result->prev.resize(params.n_prev);
return result;
}
void llama_sampling_free(struct llama_sampling_context * ctx) {
if (ctx->grammar != NULL) {
llama_grammar_free(ctx->grammar);
}
delete ctx;
}
void llama_sampling_reset(llama_sampling_context * ctx) {
if (ctx->grammar != NULL) {
llama_grammar_free(ctx->grammar);
ctx->grammar = NULL;
}
if (!ctx->parsed_grammar.rules.empty()) {
std::vector<const llama_grammar_element *> grammar_rules(ctx->parsed_grammar.c_rules());
ctx->grammar = llama_grammar_init(
grammar_rules.data(),
grammar_rules.size(), ctx->parsed_grammar.symbol_ids.at("root"));
}
std::fill(ctx->prev.begin(), ctx->prev.end(), 0);
ctx->cur.clear();
}
void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst) {
if (dst->grammar) {
llama_grammar_free(dst->grammar);
dst->grammar = nullptr;
}
if (src->grammar) {
dst->grammar = llama_grammar_copy(src->grammar);
}
dst->prev = src->prev;
}
llama_token llama_sampling_last(llama_sampling_context * ctx) {
return ctx->prev.back();
}
std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n) {
const int size = ctx_sampling->prev.size();
n = std::min(n, size);
std::string result;
for (int i = size - n; i < size; i++) {
result += llama_token_to_piece(ctx_main, ctx_sampling->prev[i]);
}
return result;
}
std::string llama_sampling_print(const llama_sampling_params & params) {
char result[1024];
snprintf(result, sizeof(result),
"\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
"\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n"
"\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present,
params.top_k, params.tfs_z, params.top_p, params.min_p, params.typical_p, params.temp,
params.mirostat, params.mirostat_eta, params.mirostat_tau);
return std::string(result);
}
std::string llama_sampling_order_print(const llama_sampling_params & params) {
std::string result = "CFG -> Penalties ";
if (params.mirostat == 0) {
for (auto s : params.samplers_sequence) {
switch (s) {
case 'k': result += "-> top_k "; break;
case 'f': result += "-> tfs_z "; break;
case 'y': result += "-> typical_p "; break;
case 'p': result += "-> top_p "; break;
case 'm': result += "-> min_p "; break;
case 't': result += "-> temp "; break;
default : break;
}
}
} else {
result += "-> mirostat ";
}
return result;
}
// no reasons to expose this function in header
static void sampler_queue(
struct llama_context * ctx_main,
const llama_sampling_params & params,
llama_token_data_array & cur_p,
size_t & min_keep) {
const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
const float temp = params.temp;
const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k;
const float top_p = params.top_p;
const float min_p = params.min_p;
const float tfs_z = params.tfs_z;
const float typical_p = params.typical_p;
const std::string & samplers_sequence = params.samplers_sequence;
for (auto s : samplers_sequence) {
switch (s){
case 'k': llama_sample_top_k (ctx_main, &cur_p, top_k, min_keep); break;
case 'f': llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep); break;
case 'y': llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); break;
case 'p': llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); break;
case 'm': llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); break;
case 't': llama_sample_temp (ctx_main, &cur_p, temp); break;
default : break;
}
}
}
llama_token llama_sampling_sample(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
struct llama_context * ctx_cfg,
const int idx) {
const llama_sampling_params & params = ctx_sampling->params;
const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
const float temp = params.temp;
const int32_t penalty_last_n = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n;
const float penalty_repeat = params.penalty_repeat;
const float penalty_freq = params.penalty_freq;
const float penalty_present = params.penalty_present;
const int mirostat = params.mirostat;
const float mirostat_tau = params.mirostat_tau;
const float mirostat_eta = params.mirostat_eta;
const bool penalize_nl = params.penalize_nl;
auto & prev = ctx_sampling->prev;
auto & cur = ctx_sampling->cur;
llama_token id = 0;
float * logits = llama_get_logits_ith(ctx_main, idx);
// apply params.logit_bias map
for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
logits[it->first] += it->second;
}
cur.clear();
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
}
llama_token_data_array cur_p = { cur.data(), cur.size(), false };
if (ctx_cfg) {
llama_sample_classifier_free_guidance(ctx_main, &cur_p, ctx_cfg, params.cfg_scale);
}
// apply penalties
if (!prev.empty()) {
const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))];
llama_sample_repetition_penalties(ctx_main, &cur_p,
prev.data() + prev.size() - penalty_last_n,
penalty_last_n, penalty_repeat, penalty_freq, penalty_present);
if (!penalize_nl) {
for (size_t idx = 0; idx < cur_p.size; idx++) {
if (cur_p.data[idx].id == llama_token_nl(llama_get_model(ctx_main))) {
cur_p.data[idx].logit = nl_logit;
break;
}
}
}
}
if (ctx_sampling->grammar != NULL) {
llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar);
}
if (temp < 0.0) {
// greedy sampling, with probs
llama_sample_softmax(ctx_main, &cur_p);
id = cur_p.data[0].id;
} else if (temp == 0.0) {
// greedy sampling, no probs
id = llama_sample_token_greedy(ctx_main, &cur_p);
} else {
if (mirostat == 1) {
const int mirostat_m = 100;
llama_sample_temp(ctx_main, &cur_p, temp);
id = llama_sample_token_mirostat(ctx_main, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &ctx_sampling->mirostat_mu);
} else if (mirostat == 2) {
llama_sample_temp(ctx_main, &cur_p, temp);
id = llama_sample_token_mirostat_v2(ctx_main, &cur_p, mirostat_tau, mirostat_eta, &ctx_sampling->mirostat_mu);
} else {
// temperature sampling
size_t min_keep = std::max(1, params.n_probs);
sampler_queue(ctx_main, params, cur_p, min_keep);
id = llama_sample_token(ctx_main, &cur_p);
//{
// const int n_top = 10;
// LOG("top %d candidates:\n", n_top);
// for (int i = 0; i < n_top; i++) {
// const llama_token id = cur_p.data[i].id;
// (void)id; // To avoid a warning that id is unused when logging is disabled.
// LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx_main, id).c_str(), cur_p.data[i].p);
// }
//}
LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx_main, id).c_str());
}
}
return id;
}
void llama_sampling_accept(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
llama_token id,
bool apply_grammar) {
ctx_sampling->prev.erase(ctx_sampling->prev.begin());
ctx_sampling->prev.push_back(id);
if (ctx_sampling->grammar != NULL && apply_grammar) {
llama_grammar_accept_token(ctx_main, ctx_sampling->grammar, id);
}
}

114
common/sampling.h Normal file
View file

@ -0,0 +1,114 @@
#pragma once
#include "llama.h"
#include "grammar-parser.h"
#include <string>
#include <vector>
#include <unordered_map>
// sampling parameters
typedef struct llama_sampling_params {
int32_t n_prev = 64; // number of previous tokens to remember
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
int32_t top_k = 40; // <= 0 to use vocab size
float top_p = 0.95f; // 1.0 = disabled
float min_p = 0.05f; // 0.0 = disabled
float tfs_z = 1.00f; // 1.0 = disabled
float typical_p = 1.00f; // 1.0 = disabled
float temp = 0.80f; // 1.0 = disabled
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
float penalty_repeat = 1.10f; // 1.0 = disabled
float penalty_freq = 0.00f; // 0.0 = disabled
float penalty_present = 0.00f; // 0.0 = disabled
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
float mirostat_tau = 5.00f; // target entropy
float mirostat_eta = 0.10f; // learning rate
bool penalize_nl = true; // consider newlines as a repeatable token
std::string samplers_sequence = "kfypmt"; // top_k, tail_free, typical_p, top_p, min_p, temp
std::string grammar; // optional BNF-like grammar to constrain sampling
// Classifier-Free Guidance
// https://arxiv.org/abs/2306.17806
std::string cfg_negative_prompt; // string to help guidance
float cfg_scale = 1.f; // how strong is guidance
std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
} llama_sampling_params;
// general sampler context
// TODO: move to llama.h
struct llama_sampling_context {
// parameters that will be used for sampling
llama_sampling_params params;
// mirostat sampler state
float mirostat_mu;
llama_grammar * grammar;
// internal
grammar_parser::parse_state parsed_grammar;
// TODO: replace with ring-buffer
std::vector<llama_token> prev;
std::vector<llama_token_data> cur;
};
#include "common.h"
// Create a new sampling context instance.
struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params);
void llama_sampling_free(struct llama_sampling_context * ctx);
// Reset the sampler context
// - clear prev tokens
// - reset grammar
void llama_sampling_reset(llama_sampling_context * ctx);
// Copy the sampler context
void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst);
// Get the last sampled token
llama_token llama_sampling_last(llama_sampling_context * ctx);
// Get a string representation of the last sampled tokens
std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n);
// Print sampling parameters into a string
std::string llama_sampling_print(const llama_sampling_params & params);
// Print sampling order into a string
std::string llama_sampling_order_print(const llama_sampling_params & params);
// this is a common sampling function used across the examples for convenience
// it can serve as a starting point for implementing your own sampling function
// Note: When using multiple sequences, it is the caller's responsibility to call
// llama_sampling_reset when a sequence ends
//
// required:
// - ctx_main: context to use for sampling
// - ctx_sampling: sampling-specific context
//
// optional:
// - ctx_cfg: context to use for classifier-free guidance
// - idx: sample from llama_get_logits_ith(ctx, idx)
//
// returns:
// - token: sampled token
// - candidates: vector of candidate tokens
//
llama_token llama_sampling_sample(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
struct llama_context * ctx_cfg,
int idx = 0);
void llama_sampling_accept(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
llama_token id,
bool apply_grammar);

8396
common/stb_image.h Normal file

File diff suppressed because it is too large Load diff

View file

@ -32,6 +32,7 @@ struct train_state * init_train_state() {
state->opt = new struct ggml_opt_context; state->opt = new struct ggml_opt_context;
state->opt->ctx = NULL; state->opt->ctx = NULL;
state->opt->params = ggml_opt_default_params(GGML_OPT_ADAM); state->opt->params = ggml_opt_default_params(GGML_OPT_ADAM);
state->opt->params.graph_size = LLAMA_TRAIN_MAX_NODES;
state->opt->loss_after = 0.0f; state->opt->loss_after = 0.0f;
return state; return state;
@ -70,7 +71,7 @@ void free_random_uniform_distribution(struct random_uniform_distribution * rnd)
struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) { struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) {
float scale = 1.0f; // xavier float scale = 1.0f; // xavier
switch (tensor->n_dims) { switch (ggml_n_dims(tensor)) {
case 1: case 1:
scale /= sqrtf((float) tensor->ne[0]); scale /= sqrtf((float) tensor->ne[0]);
for (int i0 = 0; i0 < tensor->ne[0]; i0++) { for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
@ -118,7 +119,7 @@ struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct
} }
struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd) { struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd) {
switch (tensor->n_dims) { switch (ggml_n_dims(tensor)) {
case 1: case 1:
for (int i0 = 0; i0 < tensor->ne[0]; i0++) { for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]); float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]);
@ -182,25 +183,27 @@ float fclamp(const float v, const float min, const float max) {
} }
void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) { void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) {
GGML_ASSERT(tensor->n_dims == 1);
GGML_ASSERT(tensor->ne[0] == ne0); GGML_ASSERT(tensor->ne[0] == ne0);
GGML_ASSERT(tensor->ne[1] == 1);
GGML_ASSERT(tensor->ne[2] == 1);
GGML_ASSERT(tensor->ne[3] == 1);
} }
void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) { void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) {
GGML_ASSERT(tensor->n_dims == 2);
GGML_ASSERT(tensor->ne[0] == ne0); GGML_ASSERT(tensor->ne[0] == ne0);
GGML_ASSERT(tensor->ne[1] == ne1); GGML_ASSERT(tensor->ne[1] == ne1);
GGML_ASSERT(tensor->ne[2] == 1);
GGML_ASSERT(tensor->ne[3] == 1);
} }
void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) { void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) {
GGML_ASSERT(tensor->n_dims == 3);
GGML_ASSERT(tensor->ne[0] == ne0); GGML_ASSERT(tensor->ne[0] == ne0);
GGML_ASSERT(tensor->ne[1] == ne1); GGML_ASSERT(tensor->ne[1] == ne1);
GGML_ASSERT(tensor->ne[2] == ne2); GGML_ASSERT(tensor->ne[2] == ne2);
GGML_ASSERT(tensor->ne[3] == 1);
} }
void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) { void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
GGML_ASSERT(tensor->n_dims == 4);
GGML_ASSERT(tensor->ne[0] == ne0); GGML_ASSERT(tensor->ne[0] == ne0);
GGML_ASSERT(tensor->ne[1] == ne1); GGML_ASSERT(tensor->ne[1] == ne1);
GGML_ASSERT(tensor->ne[2] == ne2); GGML_ASSERT(tensor->ne[2] == ne2);
@ -224,8 +227,8 @@ int64_t get_example_targets_batch(
bool sample_random_offsets bool sample_random_offsets
) { ) {
GGML_ASSERT(samples_count > 0); GGML_ASSERT(samples_count > 0);
GGML_ASSERT(tokens_input->n_dims == 2); GGML_ASSERT(ggml_is_matrix(tokens_input));
GGML_ASSERT(target_probs->n_dims == 3); GGML_ASSERT(ggml_is_3d(target_probs));
int64_t n_vocab = target_probs->ne[0]; int64_t n_vocab = target_probs->ne[0];
int64_t n_tokens = tokens_input->ne[0]; int64_t n_tokens = tokens_input->ne[0];
int64_t n_batch = tokens_input->ne[1]; int64_t n_batch = tokens_input->ne[1];
@ -236,8 +239,8 @@ int64_t get_example_targets_batch(
int64_t used_samples = 0; int64_t used_samples = 0;
ggml_set_f32(target_probs, 0.0f); ggml_set_f32(target_probs, 0.0f);
llama_token bos = llama_token_bos(lctx); llama_token bos = llama_token_bos(llama_get_model(lctx));
llama_token eos = llama_token_eos(lctx); llama_token eos = llama_token_eos(llama_get_model(lctx));
// printf("%s: example_id=%d n_batch=%d n_train_samples=%zu\n", __func__, example_id, n_batch, n_train_samples); // printf("%s: example_id=%d n_batch=%d n_train_samples=%zu\n", __func__, example_id, n_batch, n_train_samples);
for (int k=0; k<n_batch; ++k) { for (int k=0; k<n_batch; ++k) {
// printf("%s: batch %d\n", __func__, k); // printf("%s: batch %d\n", __func__, k);
@ -863,7 +866,7 @@ size_t tokenize_file(
(int) buf.size(), (int) buf.size(),
out_tokens.data(), out_tokens.data(),
(int) out_tokens.size(), (int) out_tokens.size(),
false); false, false);
if (n_tokens < 0) { if (n_tokens < 0) {
out_tokens.resize(-n_tokens); out_tokens.resize(-n_tokens);
n_tokens = llama_tokenize( n_tokens = llama_tokenize(
@ -872,7 +875,7 @@ size_t tokenize_file(
(int) buf.size(), (int) buf.size(),
out_tokens.data(), out_tokens.data(),
(int) out_tokens.size(), (int) out_tokens.size(),
false); false, false);
} }
if (n_tokens >= 0) { if (n_tokens >= 0) {
out_tokens.resize(n_tokens); out_tokens.resize(n_tokens);
@ -924,7 +927,7 @@ size_t tokenize_file(
for (llama_token token=0; token < n_vocab; ++token) { for (llama_token token=0; token < n_vocab; ++token) {
max_token_text_size = std::max( max_token_text_size = std::max(
max_token_text_size, max_token_text_size,
strlen(llama_token_get_text(lctx, token))); strlen(llama_token_get_text(llama_get_model(lctx), token)));
} }
// upper bound of context byte length. // upper bound of context byte length.
@ -966,7 +969,7 @@ size_t tokenize_file(
(int) buf_sample.size(), (int) buf_sample.size(),
tok_sample.data(), tok_sample.data(),
(int) tok_sample.size(), (int) tok_sample.size(),
false); false, false);
if (n_tokens < 0) { if (n_tokens < 0) {
tok_sample.resize(-n_tokens); tok_sample.resize(-n_tokens);
n_tokens = llama_tokenize(llama_get_model(lctx), n_tokens = llama_tokenize(llama_get_model(lctx),
@ -974,7 +977,7 @@ size_t tokenize_file(
(int) buf_sample.size(), (int) buf_sample.size(),
tok_sample.data(), tok_sample.data(),
(int) tok_sample.size(), (int) tok_sample.size(),
false); false, false);
GGML_ASSERT(n_tokens >= 0); GGML_ASSERT(n_tokens >= 0);
} }
GGML_ASSERT(n_tokens <= (int) tok_sample.size()); GGML_ASSERT(n_tokens <= (int) tok_sample.size());
@ -1045,6 +1048,7 @@ struct train_params_common get_default_train_params_common() {
params.n_batch = 8; params.n_batch = 8;
params.n_gradient_accumulation = 1; params.n_gradient_accumulation = 1;
params.n_epochs = -1; params.n_epochs = -1;
params.n_gpu_layers = 0;
params.custom_n_ctx = false; params.custom_n_ctx = false;
@ -1080,6 +1084,7 @@ struct train_params_common get_default_train_params_common() {
params.adam_beta2 = 0.999f; params.adam_beta2 = 0.999f;
params.adam_gclip = 1.0f; params.adam_gclip = 1.0f;
params.adam_eps_f = 0.0f; params.adam_eps_f = 0.0f;
return params; return params;
} }
@ -1133,6 +1138,7 @@ void print_common_train_usage(int /*argc*/, char ** /*argv*/, const struct train
fprintf(stderr, " --adam-beta2 N AdamW beta2 in interval [0,1). How much to smooth the second moment of gradients. (default %f)\n", params->adam_beta2); fprintf(stderr, " --adam-beta2 N AdamW beta2 in interval [0,1). How much to smooth the second moment of gradients. (default %f)\n", params->adam_beta2);
fprintf(stderr, " --adam-gclip N AdamW gradient clipping. Disabled when zero. (default %f)\n", params->adam_gclip); fprintf(stderr, " --adam-gclip N AdamW gradient clipping. Disabled when zero. (default %f)\n", params->adam_gclip);
fprintf(stderr, " --adam-epsf N AdamW epsilon for convergence test. Disabled when <= zero. (default %f)\n", params->adam_eps_f); fprintf(stderr, " --adam-epsf N AdamW epsilon for convergence test. Disabled when <= zero. (default %f)\n", params->adam_eps_f);
fprintf(stderr, " -ngl N, --n-gpu-layers N Number of model layers to offload to GPU (default %d)", params->n_gpu_layers);
fprintf(stderr, "\n"); fprintf(stderr, "\n");
} }
@ -1352,6 +1358,17 @@ bool consume_common_train_arg(
return true; return true;
} }
params->adam_gclip = std::stof(argv[i]); params->adam_gclip = std::stof(argv[i]);
} else if (arg == "-ngl" || arg == "--n-gpu-layers") {
if (++i >= argc) {
*invalid_param = true;
return true;
}
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
params->n_gpu_layers = std::stoi(argv[i]);
#else
fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
#endif
} else if (arg == "-h" || arg == "--help") { } else if (arg == "-h" || arg == "--help") {
params->print_usage = true; params->print_usage = true;
return true; return true;
@ -1425,7 +1442,7 @@ void train_opt_callback(void * vdata, int accum_step, float * sched, bool * canc
int impr_plot = -(int)(1 + (opt->loss_before - opt->loss_after) * 10.0f + 0.5f); int impr_plot = -(int)(1 + (opt->loss_before - opt->loss_after) * 10.0f + 0.5f);
if (impr_plot > 0) impr_plot = 0; if (impr_plot > 0) impr_plot = 0;
if (std::isnan(opt->loss_before) || std::isnan(opt->loss_before)) impr_plot = 0; if (std::isnan(opt->loss_before) || std::isnan(opt->loss_after)) impr_plot = 0;
printf("%s: iter=%6d sample=%zu/%zu sched=%f loss=%f", printf("%s: iter=%6d sample=%zu/%zu sched=%f loss=%f",
__func__, opt->iter, std::min(1+train->shuffle_next_sample, train->shuffle_sample_count), train->shuffle_sample_count, __func__, opt->iter, std::min(1+train->shuffle_next_sample, train->shuffle_sample_count), train->shuffle_sample_count,
*sched, opt->loss_after); *sched, opt->loss_after);

View file

@ -9,6 +9,8 @@
#include "ggml.h" #include "ggml.h"
#include "llama.h" #include "llama.h"
#define LLAMA_TRAIN_MAX_NODES 16384
typedef std::string mt19937_state; typedef std::string mt19937_state;
struct train_state { struct train_state {
@ -44,6 +46,7 @@ struct train_params_common {
int n_batch; int n_batch;
int n_gradient_accumulation; int n_gradient_accumulation;
int n_epochs; int n_epochs;
int n_gpu_layers;
bool custom_n_ctx; bool custom_n_ctx;

View file

@ -1,304 +0,0 @@
#!/usr/bin/env python3
# HF baichuan --> gguf conversion
from __future__ import annotations
import argparse
import json
import os
import struct
import sys
from pathlib import Path
from typing import TYPE_CHECKING, Any
import itertools
import gguf
import numpy as np
import torch
from sentencepiece import SentencePieceProcessor # type: ignore[import]
if TYPE_CHECKING:
from typing import TypeAlias
NDArray: TypeAlias = 'np.ndarray[Any, Any]'
# reverse HF permute back to original pth layout
def reverse_hf_permute(weights: NDArray, n_head: int, n_kv_head: int | None = None) -> NDArray:
if n_kv_head is not None and n_head != n_kv_head:
n_head //= n_kv_head
return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
.swapaxes(1, 2)
.reshape(weights.shape))
def reverse_hf_permute_part(weights: NDArray, n_part: int, n_head: int, n_head_kv: int| None = None) -> NDArray:
r = weights.shape[0] // 3
return (reverse_hf_permute(weights[r * n_part : r * n_part + r, ...], n_head, n_head_kv))
def reverse_hf_part(weights: NDArray, n_part: int) -> NDArray:
r = weights.shape[0] // 3
return weights[r * n_part : r * n_part + r, ...]
def count_model_parts(dir_model: str) -> int:
num_parts = 0
for filename in os.listdir(dir_model):
if filename.startswith("pytorch_model-"):
num_parts += 1
if num_parts > 0:
print("gguf: found " + str(num_parts) + " model parts")
return num_parts
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Convert a HuggingFace LLaMA model to a GGML compatible file")
parser.add_argument(
"--vocab-only", action="store_true",
help="extract only the vocab",
)
parser.add_argument(
"--outfile", type=Path,
help="path to write to; default: based on input",
)
parser.add_argument(
"model", type=Path,
help="directory containing model file, or model file itself (*.bin)",
)
parser.add_argument(
"ftype", type=int, choices=[0, 1], default=1, nargs='?',
help="output format - use 0 for float32, 1 for float16",
)
return parser.parse_args()
args = parse_args()
dir_model = args.model
ftype = args.ftype
if not dir_model.is_dir():
print(f'Error: {args.model} is not a directory', file = sys.stderr)
sys.exit(1)
# possible tensor data types
# ftype == 0 -> float32
# ftype == 1 -> float16
# map from ftype to string
ftype_str = ["f32", "f16"]
if args.outfile is not None:
fname_out = args.outfile
else:
# output in the same directory as the model by default
fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
print("gguf: loading model "+dir_model.name)
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
hparams = json.load(f)
print("hello print: ",hparams["architectures"][0])
if hparams["architectures"][0] != "BaichuanForCausalLM":
print("Model architecture not supported: " + hparams["architectures"][0])
sys.exit()
# get number of model parts
num_parts = count_model_parts(dir_model)
print(f"num_parts:{num_parts}\n")
ARCH=gguf.MODEL_ARCH.BAICHUAN
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
print("gguf: get model metadata")
block_count = hparams["num_hidden_layers"]
head_count = hparams["num_attention_heads"]
if "num_key_value_heads" in hparams:
head_count_kv = hparams["num_key_value_heads"]
else:
head_count_kv = head_count
if "_name_or_path" in hparams:
hf_repo = hparams["_name_or_path"]
else:
hf_repo = ""
if "max_sequence_length" in hparams:
ctx_length = hparams["max_sequence_length"]
elif "max_position_embeddings" in hparams:
ctx_length = hparams["max_position_embeddings"]
elif "model_max_length" in hparams:
ctx_length = hparams["model_max_length"]
else:
print("gguf: can not find ctx length parameter.")
sys.exit()
gguf_writer.add_name(dir_model.name)
gguf_writer.add_source_hf_repo(hf_repo)
gguf_writer.add_tensor_data_layout("Meta AI original pth")
gguf_writer.add_context_length(ctx_length)
gguf_writer.add_embedding_length(hparams["hidden_size"])
gguf_writer.add_block_count(block_count)
gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
gguf_writer.add_head_count(head_count)
gguf_writer.add_head_count_kv(head_count_kv)
gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]:
if "type" in hparams["rope_scaling"]:
if hparams["rope_scaling"]["type"] == "linear":
gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"])
# TOKENIZATION
print("gguf: get tokenizer metadata")
tokens: list[bytes] = []
scores: list[float] = []
toktypes: list[int] = []
tokenizer_model_file = dir_model / 'tokenizer.model'
if not tokenizer_model_file.is_file():
print(f'Error: Missing {tokenizer_model_file}', file = sys.stderr)
sys.exit(1)
# vocab type sentencepiece
print("gguf: get sentencepiece tokenizer vocab, scores and token types")
tokenizer = SentencePieceProcessor(str(tokenizer_model_file))
for i in range(tokenizer.vocab_size()):
text: bytes
score: float
piece = tokenizer.id_to_piece(i)
text = piece.encode("utf-8")
score = tokenizer.get_score(i)
toktype = 1 # defualt to normal token type
if tokenizer.is_unknown(i):
toktype = 2
if tokenizer.is_control(i):
toktype = 3
# toktype = 4 is user-defined = tokens from added_tokens.json
if tokenizer.is_unused(i):
toktype = 5
if tokenizer.is_byte(i):
toktype = 6
tokens.append(text)
scores.append(score)
toktypes.append(toktype)
added_tokens_file = dir_model / 'added_tokens.json'
if added_tokens_file.is_file():
with open(added_tokens_file, "r", encoding="utf-8") as f:
addtokens_json = json.load(f)
print("gguf: get added tokens")
for key in addtokens_json:
tokens.append( key.encode("utf-8") )
scores.append(-1000.0)
toktypes.append(4) # user-defined token type
gguf_writer.add_tokenizer_model("llama")
gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(dir_model)
special_vocab.add_to_gguf(gguf_writer)
# TENSORS
tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
# tensor info
print("gguf: get tensor metadata")
if num_parts == 0:
part_names = iter(("pytorch_model.bin",))
else:
part_names = (
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
)
for part_name in part_names:
if args.vocab_only:
break
print("gguf: loading model part '" + part_name + "'")
model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
tmp=model_part
for i in range(block_count):
if f"model.layers.{i}.self_attn.W_pack.weight" in model_part:
print(f"Unpacking and permuting layer {i}")
tmp[f"model.layers.{i}.self_attn.q_proj.weight"]=reverse_hf_permute_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],0,head_count,head_count)
tmp[f"model.layers.{i}.self_attn.k_proj.weight"]=reverse_hf_permute_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],1,head_count,head_count_kv)
tmp[f"model.layers.{i}.self_attn.v_proj.weight"]=reverse_hf_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],2)
del tmp[f"model.layers.{i}.self_attn.W_pack.weight"]
for name in model_part.keys():
data = model_part[name]
# we don't need these
if name.endswith(".rotary_emb.inv_freq"):
continue
old_dtype = data.dtype
# convert any unsupported data types to float32
if data.dtype != torch.float16 and data.dtype != torch.float32:
data = data.to(torch.float32)
data = data.squeeze().numpy()
# map tensor names
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
if new_name is None:
print("Can not map tensor '" + name + "'")
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16)
print(name + " -> " + new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
gguf_writer.add_tensor(new_name, data)
print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
if not args.vocab_only:
print("gguf: write tensors")
gguf_writer.write_tensors_to_file()
gguf_writer.close()
print(f"gguf: model successfully exported to '{fname_out}'")
print("")

View file

@ -1,275 +0,0 @@
#!/usr/bin/env python3
# HF falcon--> gguf conversion
from __future__ import annotations
import argparse
import json
import os
import struct
import sys
from pathlib import Path
from typing import Any
import numpy as np
import torch
from transformers import AutoTokenizer # type: ignore[import]
if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf
def bytes_to_unicode():
# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
"""
Returns list of utf-8 byte and a corresponding list of unicode strings.
The reversible bpe codes work on unicode strings.
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
This is a significant percentage of your normal, say, 32K bpe vocab.
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
And avoids mapping to whitespace/control characters the bpe code barfs on.
"""
bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
cs = bs[:]
n = 0
for b in range(2**8):
if b not in bs:
bs.append(b)
cs.append(2**8+n)
n += 1
return dict(zip(bs, (chr(n) for n in cs)))
def count_model_parts(dir_model: Path) -> int:
num_parts = 0
for filename in os.listdir(dir_model):
if filename.startswith("pytorch_model-"):
num_parts += 1
if num_parts > 0:
print("gguf: found " + str(num_parts) + " model parts")
return num_parts
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Convert a Falcon model to a GGML compatible file")
parser.add_argument(
"--vocab-only", action="store_true",
help="extract only the vocab",
)
parser.add_argument(
"--outfile", type=Path,
help="path to write to; default: based on input",
)
parser.add_argument(
"model", type=Path,
help="directory containing model file, or model file itself (*.bin)",
)
parser.add_argument(
"ftype", type=int, choices=[0, 1], default=1, nargs='?',
help="output format - use 0 for float32, 1 for float16",
)
return parser.parse_args()
args = parse_args()
dir_model = args.model
ftype = args.ftype
if not dir_model.is_dir():
print(f'Error: {args.model} is not a directory', file = sys.stderr)
sys.exit(1)
# possible tensor data types
# ftype == 0 -> float32
# ftype == 1 -> float16
# map from ftype to string
ftype_str = ["f32", "f16"]
if args.outfile is not None:
fname_out = args.outfile
else:
# output in the same directory as the model by default
fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
print("gguf: loading model "+dir_model.name)
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
hparams = json.load(f)
if hparams["architectures"][0] != "RWForCausalLM":
print("Model architecture not supported: " + hparams["architectures"][0])
sys.exit(1)
# get number of model parts
num_parts = count_model_parts(dir_model)
ARCH=gguf.MODEL_ARCH.FALCON
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
print("gguf: get model metadata")
block_count = hparams["n_layer"]
gguf_writer.add_name("Falcon")
gguf_writer.add_context_length(2048) # not in config.json
gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform
gguf_writer.add_embedding_length(hparams["hidden_size"])
gguf_writer.add_feed_forward_length(4 * hparams["hidden_size"])
gguf_writer.add_block_count(block_count)
gguf_writer.add_head_count(hparams["n_head"])
if "n_head_kv" in hparams:
gguf_writer.add_head_count_kv(hparams["n_head_kv"])
else:
gguf_writer.add_head_count_kv(1)
gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
gguf_writer.add_file_type(ftype)
# TOKENIZATION
print("gguf: get tokenizer metadata")
tokens: list[bytearray] = []
tokenizer_json_file = dir_model / 'tokenizer.json'
if not tokenizer_json_file.is_file():
print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
sys.exit(1)
# gpt2 tokenizer
gguf_writer.add_tokenizer_model("gpt2")
with open(tokenizer_json_file, "r", encoding="utf-8") as f:
tokenizer_json = json.load(f)
print("gguf: get gpt2 tokenizer vocab")
# The number of tokens in tokenizer.json can differ from the expected vocab size.
# This causes downstream issues with mismatched tensor sizes when running the inference
vocab_size = hparams["vocab_size"] if "vocab_size" in hparams else len(tokenizer_json["model"]["vocab"])
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
tokenizer = AutoTokenizer.from_pretrained(dir_model)
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
byte_encoder = bytes_to_unicode()
byte_decoder = {v: k for k, v in byte_encoder.items()}
for i in range(vocab_size):
if i in reverse_vocab:
try:
text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
except KeyError:
text = bytearray()
for c in reverse_vocab[i]:
if ord(c) < 256: # single byte character
text.append(byte_decoder[ord(c)])
else: # multibyte special token character
text.extend(c.encode('utf-8'))
else:
print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
pad_token = f"[PAD{i}]".encode("utf8")
text = bytearray(pad_token)
tokens.append(text)
gguf_writer.add_token_list(tokens)
special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
special_vocab.add_to_gguf(gguf_writer)
# TENSORS
tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
# params for qkv transform
n_head = hparams["n_head"]
n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1
head_dim = hparams["hidden_size"] // n_head
# tensor info
print("gguf: get tensor metadata")
if num_parts == 0:
part_names = iter(("pytorch_model.bin",))
else:
part_names = (
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
)
for part_name in part_names:
if args.vocab_only:
break
print("gguf: loading model part '" + part_name + "'")
model_part = torch.load(dir_model / part_name, map_location="cpu")
for name in model_part.keys():
data = model_part[name]
old_dtype = data.dtype
# convert any unsupported data types to float32
if data.dtype != torch.float16 and data.dtype != torch.float32:
data = data.to(torch.float32)
# QKV tensor transform
# The original query_key_value tensor contains n_head_kv "kv groups",
# each consisting of n_head/n_head_kv query weights followed by one key
# and one value weight (shared by all query heads in the kv group).
# This layout makes it a big pain to work with in GGML.
# So we rearrange them here,, so that we have n_head query weights
# followed by n_head_kv key weights followed by n_head_kv value weights,
# in contiguous fashion.
# ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py
if "query_key_value" in name:
qkv = data.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
q = qkv[:, :-2 ].reshape(n_head * head_dim, head_dim * n_head)
k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
data = torch.cat((q,k,v)).reshape_as(data)
data = data.squeeze().numpy()
# map tensor names
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
if new_name is None:
print("Can not map tensor '" + name + "'")
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16)
print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
gguf_writer.add_tensor(new_name, data)
print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
if not args.vocab_only:
print("gguf: write tensors")
gguf_writer.write_tensors_to_file()
gguf_writer.close()
print(f"gguf: model successfully exported to '{fname_out}'")
print("")

View file

@ -1,251 +0,0 @@
#!/usr/bin/env python3
# HF gptneox--> gguf conversion
from __future__ import annotations
import argparse
import json
import os
import struct
import sys
from pathlib import Path
from typing import Any
import numpy as np
import torch
from transformers import AutoTokenizer # type: ignore[import]
if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf
# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
def bytes_to_unicode():
"""
Returns list of utf-8 byte and a corresponding list of unicode strings.
The reversible bpe codes work on unicode strings.
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
This is a significant percentage of your normal, say, 32K bpe vocab.
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
And avoids mapping to whitespace/control characters the bpe code barfs on.
"""
bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
cs = bs[:]
n = 0
for b in range(2**8):
if b not in bs:
bs.append(b)
cs.append(2**8+n)
n += 1
return dict(zip(bs, (chr(n) for n in cs)))
def count_model_parts(dir_model: Path) -> int:
num_parts = 0
for filename in os.listdir(dir_model):
if filename.startswith("pytorch_model-"):
num_parts += 1
if num_parts > 0:
print("gguf: found " + str(num_parts) + " model parts")
return num_parts
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Convert a GPT-NeoX model to a GGML compatible file")
parser.add_argument(
"--vocab-only", action="store_true",
help="extract only the vocab",
)
parser.add_argument(
"--outfile", type=Path,
help="path to write to; default: based on input",
)
parser.add_argument(
"model", type=Path,
help="directory containing model file, or model file itself (*.bin)",
)
parser.add_argument(
"ftype", type=int, choices=[0, 1], default=1, nargs='?',
help="output format - use 0 for float32, 1 for float16",
)
return parser.parse_args()
args = parse_args()
dir_model = args.model
ftype = args.ftype
if not dir_model.is_dir():
print(f'Error: {args.model} is not a directory', file = sys.stderr)
sys.exit(1)
# possible tensor data types
# ftype == 0 -> float32
# ftype == 1 -> float16
# map from ftype to string
ftype_str = ["f32", "f16"]
if args.outfile is not None:
fname_out = args.outfile
else:
# output in the same directory as the model by default
fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
print("gguf: loading model "+dir_model.name)
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
hparams = json.load(f)
if hparams["architectures"][0] != "GPTNeoXForCausalLM":
print("Model architecture not supported: " + hparams["architectures"][0])
sys.exit()
# get number of model parts
num_parts = count_model_parts(dir_model)
ARCH=gguf.MODEL_ARCH.GPTNEOX
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
print("gguf: get model metadata")
block_count = hparams["num_hidden_layers"]
gguf_writer.add_name(dir_model.name)
gguf_writer.add_context_length(hparams["max_position_embeddings"])
gguf_writer.add_embedding_length(hparams["hidden_size"])
gguf_writer.add_block_count(block_count)
gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
gguf_writer.add_rope_dimension_count(int(hparams["rotary_pct"]*(hparams["hidden_size"]//hparams["num_attention_heads"])))
gguf_writer.add_head_count(hparams["num_attention_heads"])
gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
gguf_writer.add_layer_norm_eps(hparams["layer_norm_eps"])
# TOKENIZATION
print("gguf: get tokenizer metadata")
tokens: list[bytearray] = []
tokenizer_json_file = dir_model / 'tokenizer.json'
if not tokenizer_json_file.is_file():
print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
sys.exit(1)
# gpt2 tokenizer
gguf_writer.add_tokenizer_model("gpt2")
with open(tokenizer_json_file, "r", encoding="utf-8") as f:
tokenizer_json = json.load(f)
print("gguf: get gpt2 tokenizer vocab")
vocab_size = len(tokenizer_json["model"]["vocab"])
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
tokenizer = AutoTokenizer.from_pretrained(dir_model)
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
byte_encoder = bytes_to_unicode()
byte_decoder = {v: k for k, v in byte_encoder.items()}
for i in range(vocab_size):
if i in reverse_vocab:
try:
text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
except KeyError:
text = bytearray()
for c in reverse_vocab[i]:
if ord(c) < 256: # single byte character
text.append(byte_decoder[ord(c)])
else: # multibyte special token character
text.extend(c.encode('utf-8'))
else:
print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
pad_token = f"[PAD{i}]".encode("utf8")
text = bytearray(pad_token)
tokens.append(text)
gguf_writer.add_token_list(tokens)
special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
special_vocab.add_to_gguf(gguf_writer)
# TENSORS
tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
# tensor info
print("gguf: get tensor metadata")
if num_parts == 0:
part_names = iter(("pytorch_model.bin",))
else:
part_names = (
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
)
for part_name in part_names:
if args.vocab_only:
break
print("gguf: loading model part '" + part_name + "'")
model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
for name in model_part.keys():
data = model_part[name]
# we don't need these
if name.endswith(".attention.masked_bias") or name.endswith(".attention.bias") or name.endswith(".attention.rotary_emb.inv_freq"):
continue
old_dtype = data.dtype
# convert any unsupported data types to float32
if data.dtype != torch.float16 and data.dtype != torch.float32:
data = data.to(torch.float32)
data = data.squeeze().numpy()
# map tensor names
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
if new_name is None:
print("Can not map tensor '" + name + "'")
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16)
print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
gguf_writer.add_tensor(new_name, data)
print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
if not args.vocab_only:
print("gguf: write tensors")
gguf_writer.write_tensors_to_file()
gguf_writer.close()
print(f"gguf: model successfully exported to '{fname_out}'")
print("")

1048
convert-hf-to-gguf.py Executable file

File diff suppressed because it is too large Load diff

View file

@ -2,7 +2,6 @@
from __future__ import annotations from __future__ import annotations
import argparse import argparse
import math
import struct import struct
import sys import sys
from enum import IntEnum from enum import IntEnum
@ -12,34 +11,16 @@ import numpy as np
import os import os
if 'NO_LOCAL_GGUF' not in os.environ: if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf import gguf
# Note: Does not support GGML_QKK_64
QK_K = 256
# Items here are (block size, type size)
GGML_QUANT_SIZES = {
gguf.GGMLQuantizationType.F32 : (1, 4),
gguf.GGMLQuantizationType.F16 : (1, 2),
gguf.GGMLQuantizationType.Q4_0 : (32, 2 + 16),
gguf.GGMLQuantizationType.Q4_1 : (32, 2 + 2 + 16),
gguf.GGMLQuantizationType.Q5_0 : (32, 2 + 4 + 16),
gguf.GGMLQuantizationType.Q5_1 : (32, 2 + 2 + 4 + 16),
gguf.GGMLQuantizationType.Q8_0 : (32, 2 + 32),
gguf.GGMLQuantizationType.Q8_1 : (32, 4 + 4 + 32),
gguf.GGMLQuantizationType.Q2_K : (256, 2 + 2 + QK_K // 16 + QK_K // 4),
gguf.GGMLQuantizationType.Q3_K : (256, 2 + QK_K // 4 + QK_K // 8 + 12),
gguf.GGMLQuantizationType.Q4_K : (256, 2 + 2 + QK_K // 2 + 12),
gguf.GGMLQuantizationType.Q5_K : (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12),
gguf.GGMLQuantizationType.Q6_K : (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16),
gguf.GGMLQuantizationType.Q8_K : (256, 4 + QK_K + QK_K // 8),
}
class GGMLFormat(IntEnum): class GGMLFormat(IntEnum):
GGML = 0 GGML = 0
GGMF = 1 GGMF = 1
GGJT = 2 GGJT = 2
class GGMLFType(IntEnum): class GGMLFType(IntEnum):
ALL_F32 = 0 ALL_F32 = 0
MOSTLY_F16 = 1 MOSTLY_F16 = 1
@ -59,6 +40,7 @@ class GGMLFType(IntEnum):
MOSTLY_Q5_K_M = 17 MOSTLY_Q5_K_M = 17
MOSTLY_Q6_K = 18 MOSTLY_Q6_K = 18
class Hyperparameters: class Hyperparameters:
def __init__(self): def __init__(self):
self.n_vocab = self.n_embd = self.n_mult = self.n_head = 0 self.n_vocab = self.n_embd = self.n_mult = self.n_head = 0
@ -90,6 +72,7 @@ class Hyperparameters:
def __str__(self): def __str__(self):
return f'<Hyperparameters: n_vocab={self.n_vocab}, n_embd={self.n_embd}, n_mult={self.n_mult}, n_head={self.n_head}, n_layer={self.n_layer}, n_rot={self.n_rot}, n_ff={self.n_ff}, ftype={self.ftype.name}>' return f'<Hyperparameters: n_vocab={self.n_vocab}, n_embd={self.n_embd}, n_mult={self.n_mult}, n_head={self.n_head}, n_layer={self.n_layer}, n_rot={self.n_rot}, n_ff={self.n_ff}, ftype={self.ftype.name}>'
class Vocab: class Vocab:
def __init__(self, load_scores = True): def __init__(self, load_scores = True):
self.items = [] self.items = []
@ -111,6 +94,7 @@ class Vocab:
self.items.append((item_text, item_score)) self.items.append((item_text, item_score))
return offset - orig_offset return offset - orig_offset
class Tensor: class Tensor:
def __init__(self, use_padding = True): def __init__(self, use_padding = True):
self.name = None self.name = None
@ -125,7 +109,7 @@ class Tensor:
(n_dims, name_len, dtype) = struct.unpack('<3I', data[offset:offset + 12]) (n_dims, name_len, dtype) = struct.unpack('<3I', data[offset:offset + 12])
assert n_dims >= 0 and n_dims <= 4, f'Invalid tensor dimensions {n_dims}' assert n_dims >= 0 and n_dims <= 4, f'Invalid tensor dimensions {n_dims}'
assert name_len < 4096, 'Absurd tensor name length' assert name_len < 4096, 'Absurd tensor name length'
quant = GGML_QUANT_SIZES.get(dtype) quant = gguf.GGML_QUANT_SIZES.get(dtype)
assert quant is not None, 'Unknown tensor type' assert quant is not None, 'Unknown tensor type'
(blksize, tysize) = quant (blksize, tysize) = quant
offset += 12 offset += 12
@ -144,6 +128,7 @@ class Tensor:
# print(n_dims, name_len, dtype, self.dims, self.name, pad) # print(n_dims, name_len, dtype, self.dims, self.name, pad)
return offset - orig_offset return offset - orig_offset
class GGMLModel: class GGMLModel:
def __init__(self): def __init__(self):
self.hyperparameters = None self.hyperparameters = None
@ -180,8 +165,8 @@ class GGMLModel:
if ftype not in (GGMLFType.ALL_F32, GGMLFType.MOSTLY_F16): if ftype not in (GGMLFType.ALL_F32, GGMLFType.MOSTLY_F16):
err = 'Quantizations changed in GGJTv2. Can only convert unquantized GGML files older than GGJTv2.' err = 'Quantizations changed in GGJTv2. Can only convert unquantized GGML files older than GGJTv2.'
elif (self.file_format == GGMLFormat.GGJT and self.format_version == 2): elif (self.file_format == GGMLFormat.GGJT and self.format_version == 2):
if ftype in ( GGMLFType.MOSTLY_Q4_0, GGMLFType.MOSTLY_Q4_1, if ftype in (GGMLFType.MOSTLY_Q4_0, GGMLFType.MOSTLY_Q4_1,
GGMLFType.MOSTLY_Q4_1_SOME_F16, GGMLFType.MOSTLY_Q8_0): GGMLFType.MOSTLY_Q4_1_SOME_F16, GGMLFType.MOSTLY_Q8_0):
err = 'Q4 and Q8 quantizations changed in GGJTv3.' err = 'Q4 and Q8 quantizations changed in GGJTv3.'
if len(err) > 0: if len(err) > 0:
raise ValueError(f'{err} Sorry, your {self.file_format.name}v{self.format_version} file of type {ftype.name} is not eligible for conversion.') raise ValueError(f'{err} Sorry, your {self.file_format.name}v{self.format_version} file of type {ftype.name} is not eligible for conversion.')
@ -208,6 +193,7 @@ class GGMLModel:
hp.set_n_ff(self) hp.set_n_ff(self)
return offset return offset
class GGMLToGGUF: class GGMLToGGUF:
def __init__(self, ggml_model, data, cfg, params_override = None, vocab_override = None, special_vocab = None): def __init__(self, ggml_model, data, cfg, params_override = None, vocab_override = None, special_vocab = None):
hp = ggml_model.hyperparameters hp = ggml_model.hyperparameters
@ -238,7 +224,7 @@ class GGMLToGGUF:
gguf_writer = gguf.GGUFWriter( gguf_writer = gguf.GGUFWriter(
self.cfg.output, self.cfg.output,
gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA],
use_temp_file = False ) use_temp_file = False)
self.add_params(gguf_writer) self.add_params(gguf_writer)
self.add_vocab(gguf_writer) self.add_vocab(gguf_writer)
if self.special_vocab is not None: if self.special_vocab is not None:
@ -362,7 +348,8 @@ class GGMLToGGUF:
mapped_name, mapped_name,
data[tensor.start_offset:tensor.start_offset + tensor.len_bytes], data[tensor.start_offset:tensor.start_offset + tensor.len_bytes],
raw_shape = tempdims, raw_shape = tempdims,
raw_dtype = tensor.dtype ) raw_dtype = tensor.dtype)
def handle_metadata(cfg, hp): def handle_metadata(cfg, hp):
import convert import convert
@ -386,36 +373,40 @@ def handle_metadata(cfg, hp):
raise ValueError('Unable to load metadata') raise ValueError('Unable to load metadata')
vocab = convert.load_vocab( vocab = convert.load_vocab(
cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir, cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir,
cfg.vocabtype ) cfg.vocabtype)
# FIXME: Respect cfg.vocab_dir? # FIXME: Respect cfg.vocab_dir?
svocab = gguf.SpecialVocab(cfg.model_metadata_dir) svocab = gguf.SpecialVocab(cfg.model_metadata_dir,
load_merges = cfg.vocabtype == 'bpe',
n_vocab = vocab.vocab_size)
convert.check_vocab_size(params, vocab) convert.check_vocab_size(params, vocab)
return (params, vocab, svocab) return (params, vocab, svocab)
def handle_args(): def handle_args():
parser = argparse.ArgumentParser(description = 'Convert GGML models to GGUF') parser = argparse.ArgumentParser(description = 'Convert GGML models to GGUF')
parser.add_argument('--input', '-i', type = Path, required = True, parser.add_argument('--input', '-i', type = Path, required = True,
help = 'Input GGMLv3 filename') help = 'Input GGMLv3 filename')
parser.add_argument('--output', '-o', type = Path, required = True, parser.add_argument('--output', '-o', type = Path, required = True,
help ='Output GGUF filename') help ='Output GGUF filename')
parser.add_argument('--name', parser.add_argument('--name',
help = 'Set model name') help = 'Set model name')
parser.add_argument('--desc', parser.add_argument('--desc',
help = 'Set model description') help = 'Set model description')
parser.add_argument('--gqa', type = int, default = 1, parser.add_argument('--gqa', type = int, default = 1,
help = 'grouped-query attention factor (use 8 for LLaMA2 70B)') help = 'grouped-query attention factor (use 8 for LLaMA2 70B)')
parser.add_argument('--eps', default = '5.0e-06', parser.add_argument('--eps', default = '5.0e-06',
help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2') help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2')
parser.add_argument('--context-length', '-c', type=int, default = 2048, parser.add_argument('--context-length', '-c', type=int, default = 2048,
help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096') help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096')
parser.add_argument('--model-metadata-dir', '-m', type = Path, parser.add_argument('--model-metadata-dir', '-m', type = Path,
help ='Load HuggingFace/.pth vocab and metadata from the specified directory') help ='Load HuggingFace/.pth vocab and metadata from the specified directory')
parser.add_argument("--vocab-dir", type=Path, parser.add_argument("--vocab-dir", type=Path,
help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir") help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
parser.add_argument("--vocabtype", choices=["spm", "bpe"], default="spm", parser.add_argument("--vocabtype", choices=["spm", "bpe"], default="spm",
help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)") help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)")
return parser.parse_args() return parser.parse_args()
def main(): def main():
cfg = handle_args() cfg = handle_args()
print(f'* Using config: {cfg}') print(f'* Using config: {cfg}')
@ -425,7 +416,7 @@ def main():
data = np.memmap(cfg.input, mode = 'r') data = np.memmap(cfg.input, mode = 'r')
model = GGMLModel() model = GGMLModel()
print('* Scanning GGML input file') print('* Scanning GGML input file')
offset = model.load(data, 0) offset = model.load(data, 0) # noqa
print(f'* GGML model hyperparameters: {model.hyperparameters}') print(f'* GGML model hyperparameters: {model.hyperparameters}')
vocab_override = None vocab_override = None
params_override = None params_override = None
@ -440,12 +431,15 @@ def main():
print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n') print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
if model.file_format == GGMLFormat.GGML: if model.file_format == GGMLFormat.GGML:
print('! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!') print('! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!')
converter = GGMLToGGUF(model, data, cfg, converter = GGMLToGGUF(
model, data, cfg,
params_override = params_override, params_override = params_override,
vocab_override = vocab_override, vocab_override = vocab_override,
special_vocab = special_vocab ) special_vocab = special_vocab
)
converter.save() converter.save()
print(f'* Successful completion. Output saved to: {cfg.output}') print(f'* Successful completion. Output saved to: {cfg.output}')
if __name__ == '__main__': if __name__ == '__main__':
main() main()

View file

@ -0,0 +1,132 @@
import torch
import os
from pprint import pprint
import sys
import argparse
from pathlib import Path
from sentencepiece import SentencePieceProcessor
if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf
def _flatten_dict(dct, tensors, prefix=None):
assert isinstance(dct, dict)
for key in dct.keys():
new_prefix = prefix + '.' + key if prefix is not None else key
if isinstance(dct[key], torch.Tensor):
tensors[new_prefix] = dct[key]
elif isinstance(dct[key], dict):
_flatten_dict(dct[key], tensors, new_prefix)
else:
raise ValueError(type(dct[key]))
return None
def _get_sentencepiece_tokenizer_info(dir_model: Path):
tokenizer_path = dir_model / 'adept_vocab.model'
print('gguf: getting sentencepiece tokenizer from', tokenizer_path)
tokenizer = SentencePieceProcessor(str(tokenizer_path))
print('gguf: adding tokens')
tokens: list[bytes] = []
scores: list[float] = []
toktypes: list[int] = []
for i in range(tokenizer.vocab_size()):
text: bytes
score: float
piece = tokenizer.id_to_piece(i)
text = piece.encode("utf-8")
score = tokenizer.get_score(i)
toktype = 1
if tokenizer.is_unknown(i):
toktype = 2
if tokenizer.is_control(i):
toktype = 3
if tokenizer.is_unused(i):
toktype = 5
if tokenizer.is_byte(i):
toktype = 6
tokens.append(text)
scores.append(score)
toktypes.append(toktype)
pass
return tokens, scores, toktypes
def main():
parser = argparse.ArgumentParser(description="Convert a Persimmon model from Adept (e.g. Persimmon 8b chat) to a GGML compatible file")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("--ckpt-path", type=Path, help="path to persimmon checkpoint .pt file")
parser.add_argument("--model-dir", type=Path, help="directory containing model e.g. 8b_chat_model_release")
parser.add_argument("--adept-inference-dir", type=str, help="path to adept-inference code directory")
args = parser.parse_args()
sys.path.append(str(args.adept_inference_dir))
persimmon_model = torch.load(args.ckpt_path)
hparams = persimmon_model['args']
pprint(hparams)
tensors = {}
_flatten_dict(persimmon_model['model'], tensors, None)
arch = gguf.MODEL_ARCH.PERSIMMON
gguf_writer = gguf.GGUFWriter(args.outfile, gguf.MODEL_ARCH_NAMES[arch])
block_count = hparams.num_layers
head_count = hparams.num_attention_heads
head_count_kv = head_count
ctx_length = hparams.seq_length
hidden_size = hparams.hidden_size
gguf_writer.add_name('persimmon-8b-chat')
gguf_writer.add_context_length(ctx_length)
gguf_writer.add_embedding_length(hidden_size)
gguf_writer.add_block_count(block_count)
gguf_writer.add_feed_forward_length(hparams.ffn_hidden_size)
gguf_writer.add_rope_dimension_count(hidden_size // head_count)
gguf_writer.add_head_count(head_count)
gguf_writer.add_head_count_kv(head_count_kv)
gguf_writer.add_rope_freq_base(hparams.rotary_emb_base)
gguf_writer.add_layer_norm_eps(hparams.layernorm_epsilon)
tokens, scores, toktypes = _get_sentencepiece_tokenizer_info(args.model_dir)
gguf_writer.add_tokenizer_model('llama')
gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)
gguf_writer.add_bos_token_id(71013)
gguf_writer.add_eos_token_id(71013)
tensor_map = gguf.get_tensor_name_map(arch, block_count)
print(tensor_map)
for name in tensors.keys():
data = tensors[name]
if name.endswith(".self_attention.rotary_emb.inv_freq"):
continue
old_dtype = data.dtype
# TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?)
data = data.to(torch.float32).squeeze().numpy()
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
if new_name is None:
print("Can not map tensor '" + name + "'")
sys.exit()
n_dims = len(data.shape)
print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
gguf_writer.add_tensor(new_name, data)
print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
print("gguf: write tensors")
gguf_writer.write_tensors_to_file()
gguf_writer.close()
print(f"gguf: model successfully exported to '{args.outfile}'")
print("")
if __name__ == '__main__':
main()

View file

@ -1,242 +0,0 @@
#!/usr/bin/env python3
# HF starcoder --> gguf conversion
from __future__ import annotations
import argparse
import json
import os
import struct
import sys
from pathlib import Path
from typing import Any
import numpy as np
import torch
from transformers import AutoTokenizer # type: ignore[import]
if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf
def bytes_to_unicode():
# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
"""
Returns list of utf-8 byte and a corresponding list of unicode strings.
The reversible bpe codes work on unicode strings.
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
This is a significant percentage of your normal, say, 32K bpe vocab.
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
And avoids mapping to whitespace/control characters the bpe code barfs on.
"""
bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
cs = bs[:]
n = 0
for b in range(2**8):
if b not in bs:
bs.append(b)
cs.append(2**8+n)
n += 1
return dict(zip(bs, (chr(n) for n in cs)))
def count_model_parts(dir_model: Path) -> int:
num_parts = 0
for filename in os.listdir(dir_model):
if filename.startswith("pytorch_model-"):
num_parts += 1
if num_parts > 0:
print("gguf: found " + str(num_parts) + " model parts")
return num_parts
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Convert a StarCoder model to a GGML compatible file")
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
parser.add_argument("ftype", type=int, help="output format - use 0 for float32, 1 for float16", choices=[0, 1], default = 1)
return parser.parse_args()
args = parse_args()
dir_model = args.model
ftype = args.ftype
if not dir_model.is_dir():
print(f'Error: {args.model} is not a directory', file = sys.stderr)
sys.exit(1)
# possible tensor data types
# ftype == 0 -> float32
# ftype == 1 -> float16
# map from ftype to string
ftype_str = ["f32", "f16"]
if args.outfile is not None:
fname_out = args.outfile
else:
# output in the same directory as the model by default
fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
print("gguf: loading model "+dir_model.name)
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
hparams = json.load(f)
if hparams["architectures"][0] != "GPTBigCodeForCausalLM":
print("Model architecture not supported: " + hparams["architectures"][0])
sys.exit(1)
# get number of model parts
num_parts = count_model_parts(dir_model)
ARCH=gguf.MODEL_ARCH.STARCODER
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
print("gguf: get model metadata")
block_count = hparams["n_layer"]
gguf_writer.add_name("StarCoder")
gguf_writer.add_context_length(hparams["n_positions"])
gguf_writer.add_embedding_length(hparams["n_embd"])
gguf_writer.add_feed_forward_length(4 * hparams["n_embd"])
gguf_writer.add_block_count(block_count)
gguf_writer.add_head_count(hparams["n_head"])
gguf_writer.add_head_count_kv(1)
gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
gguf_writer.add_file_type(ftype)
# TOKENIZATION
print("gguf: get tokenizer metadata")
tokens: list[bytearray] = []
tokenizer_json_file = dir_model / 'tokenizer.json'
if not tokenizer_json_file.is_file():
print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
sys.exit(1)
# gpt2 tokenizer
gguf_writer.add_tokenizer_model("gpt2")
with open(tokenizer_json_file, "r", encoding="utf-8") as f:
tokenizer_json = json.load(f)
print("gguf: get gpt2 tokenizer vocab")
# The number of tokens in tokenizer.json can differ from the expected vocab size.
# This causes downstream issues with mismatched tensor sizes when running the inference
vocab_size = hparams["vocab_size"] if "vocab_size" in hparams else len(tokenizer_json["model"]["vocab"])
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
tokenizer = AutoTokenizer.from_pretrained(dir_model)
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
byte_encoder = bytes_to_unicode()
byte_decoder = {v: k for k, v in byte_encoder.items()}
for i in range(vocab_size):
if i in reverse_vocab:
try:
text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
except KeyError:
text = bytearray()
for c in reverse_vocab[i]:
if ord(c) < 256: # single byte character
text.append(byte_decoder[ord(c)])
else: # multibyte special token character
text.extend(c.encode('utf-8'))
else:
print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
pad_token = f"[PAD{i}]".encode("utf8")
text = bytearray(pad_token)
tokens.append(text)
gguf_writer.add_token_list(tokens)
special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
special_vocab.add_to_gguf(gguf_writer)
# TENSORS
tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
# params for qkv transform
n_head = hparams["n_head"]
n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1
head_dim = hparams["n_embd"] // n_head
# tensor info
print("gguf: get tensor metadata")
if num_parts == 0:
part_names = iter(("pytorch_model.bin",))
else:
part_names = (
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
)
for part_name in part_names:
if args.vocab_only:
break
print("gguf: loading model part '" + part_name + "'")
model_part = torch.load(dir_model / part_name, map_location="cpu")
for name in model_part.keys():
data = model_part[name]
old_dtype = data.dtype
# convert any unsupported data types to float32
if data.dtype != torch.float16 and data.dtype != torch.float32:
data = data.to(torch.float32)
data = data.squeeze().numpy()
# map tensor names
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
if new_name is None:
print("Can not map tensor '" + name + "'")
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16)
print(name, "=>", new_name + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype))
gguf_writer.add_tensor(new_name, data)
print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
if not args.vocab_only:
print("gguf: write tensors")
gguf_writer.write_tensors_to_file()
gguf_writer.close()
print(f"gguf: model successfully exported to '{fname_out}'")
print("")

View file

@ -3,15 +3,14 @@ from __future__ import annotations
import argparse import argparse
import concurrent.futures import concurrent.futures
import copy
import enum import enum
import faulthandler import faulthandler
import functools import functools
import io
import itertools import itertools
import json import json
import math import math
import mmap import mmap
import os
import pickle import pickle
import re import re
import signal import signal
@ -20,17 +19,17 @@ import sys
import time import time
import zipfile import zipfile
from abc import ABCMeta, abstractmethod from abc import ABCMeta, abstractmethod
from collections import OrderedDict
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from dataclasses import dataclass from dataclasses import dataclass
from pathlib import Path from pathlib import Path
from typing import IO, TYPE_CHECKING, Any, Callable, Generator, Iterable, Literal, Sequence, TypeVar from typing import IO, TYPE_CHECKING, Any, Callable, Iterable, Literal, Optional, TypeVar, cast
import numpy as np import numpy as np
from sentencepiece import SentencePieceProcessor # type: ignore[import] from sentencepiece import SentencePieceProcessor
import os
if 'NO_LOCAL_GGUF' not in os.environ: if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf import gguf
if TYPE_CHECKING: if TYPE_CHECKING:
@ -41,14 +40,15 @@ if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'):
NDArray: TypeAlias = 'np.ndarray[Any, Any]' NDArray: TypeAlias = 'np.ndarray[Any, Any]'
ARCH=gguf.MODEL_ARCH.LLAMA ARCH = gguf.MODEL_ARCH.LLAMA
NAMES=gguf.MODEL_TENSOR_NAMES[ARCH]
DEFAULT_CONCURRENCY = 8 DEFAULT_CONCURRENCY = 8
# #
# data types # data types
# #
@dataclass(frozen=True) @dataclass(frozen=True)
class DataType: class DataType:
name: str name: str
@ -58,14 +58,17 @@ class DataType:
def elements_to_bytes(self, n_elements: int) -> int: def elements_to_bytes(self, n_elements: int) -> int:
return n_elements * self.dtype.itemsize return n_elements * self.dtype.itemsize
@dataclass(frozen=True) @dataclass(frozen=True)
class UnquantizedDataType(DataType): class UnquantizedDataType(DataType):
pass pass
DT_F16 = UnquantizedDataType('F16', dtype = np.dtype(np.float16), valid_conversions = ['F32', 'Q8_0'])
DT_F32 = UnquantizedDataType('F32', dtype = np.dtype(np.float32), valid_conversions = ['F16', 'Q8_0']) DT_F16 = UnquantizedDataType('F16', dtype = np.dtype(np.float16), valid_conversions = ['F32', 'Q8_0'])
DT_I32 = UnquantizedDataType('I32', dtype = np.dtype(np.int16), valid_conversions = []) DT_F32 = UnquantizedDataType('F32', dtype = np.dtype(np.float32), valid_conversions = ['F16', 'Q8_0'])
DT_BF16 = UnquantizedDataType('BF16', dtype = np.dtype(np.uint16), valid_conversions = ['F32', 'F16', 'Q8_0']) DT_I32 = UnquantizedDataType('I32', dtype = np.dtype(np.int16), valid_conversions = [])
DT_BF16 = UnquantizedDataType('BF16', dtype = np.dtype(np.uint16), valid_conversions = ['F32', 'F16', 'Q8_0'])
@dataclass(frozen=True) @dataclass(frozen=True)
class QuantizedDataType(DataType): class QuantizedDataType(DataType):
@ -80,6 +83,7 @@ class QuantizedDataType(DataType):
assert n_elements % self.block_size == 0, f'Invalid number of elements {n_elements} for {self.name} with block size {self.block_size}' assert n_elements % self.block_size == 0, f'Invalid number of elements {n_elements} for {self.name} with block size {self.block_size}'
return self.quantized_dtype.itemsize * (n_elements // self.block_size) return self.quantized_dtype.itemsize * (n_elements // self.block_size)
@dataclass(frozen=True) @dataclass(frozen=True)
class Q8_0QuantizedDataType(QuantizedDataType): class Q8_0QuantizedDataType(QuantizedDataType):
# Mini Q8_0 quantization in Python! # Mini Q8_0 quantization in Python!
@ -89,6 +93,7 @@ class Q8_0QuantizedDataType(QuantizedDataType):
n_blocks = arr.size // self.block_size n_blocks = arr.size // self.block_size
blocks = arr.reshape((n_blocks, self.block_size)) blocks = arr.reshape((n_blocks, self.block_size))
# Much faster implementation of block quantization contributed by @Cebtenzzre # Much faster implementation of block quantization contributed by @Cebtenzzre
def quantize_blocks_q8_0(blocks: NDArray) -> Iterable[tuple[Any, Any]]: def quantize_blocks_q8_0(blocks: NDArray) -> Iterable[tuple[Any, Any]]:
d = abs(blocks).max(axis = 1) / np.float32(127) d = abs(blocks).max(axis = 1) / np.float32(127)
with np.errstate(divide = 'ignore'): with np.errstate(divide = 'ignore'):
@ -97,10 +102,11 @@ class Q8_0QuantizedDataType(QuantizedDataType):
yield from zip(d, qs) yield from zip(d, qs)
return np.fromiter(quantize_blocks_q8_0(blocks), count = n_blocks, dtype = self.quantized_dtype) return np.fromiter(quantize_blocks_q8_0(blocks), count = n_blocks, dtype = self.quantized_dtype)
DT_Q8_0 = Q8_0QuantizedDataType('Q8_0', DT_Q8_0 = Q8_0QuantizedDataType('Q8_0',
dtype = np.dtype(np.float32), valid_conversions = [], dtype = np.dtype(np.float32), valid_conversions = [],
ggml_type = gguf.GGMLQuantizationType.Q8_0, block_size = 32, ggml_type = gguf.GGMLQuantizationType.Q8_0, block_size = 32,
quantized_dtype = np.dtype([('d', '<f2'), ('qs', 'i1', (32,))])) quantized_dtype = np.dtype([('d', '<f2'), ('qs', 'i1', (32,))]))
# Quantized types skipped here because they may also map to np.float32 # Quantized types skipped here because they may also map to np.float32
NUMPY_TYPE_TO_DATA_TYPE: dict[np.dtype[Any], DataType] = {} NUMPY_TYPE_TO_DATA_TYPE: dict[np.dtype[Any], DataType] = {}
@ -119,6 +125,8 @@ SAFETENSORS_DATA_TYPES: dict[str, DataType] = {
# TODO: match this with `llama_ftype` # TODO: match this with `llama_ftype`
# TODO: rename to LLAMAFileType # TODO: rename to LLAMAFileType
# TODO: move to `gguf.py` # TODO: move to `gguf.py`
class GGMLFileType(enum.IntEnum): class GGMLFileType(enum.IntEnum):
AllF32 = 0 AllF32 = 0
MostlyF16 = 1 # except 1d tensors MostlyF16 = 1 # except 1d tensors
@ -131,6 +139,7 @@ class GGMLFileType(enum.IntEnum):
# 1D tensors are always F32. # 1D tensors are always F32.
return dt if len(tensor.shape) > 1 else DT_F32 return dt if len(tensor.shape) > 1 else DT_F32
GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = { GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = {
GGMLFileType.AllF32 : DT_F32, GGMLFileType.AllF32 : DT_F32,
GGMLFileType.MostlyF16 : DT_F16, GGMLFileType.MostlyF16 : DT_F16,
@ -141,19 +150,25 @@ GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = {
# hparams loading # hparams loading
# #
@dataclass @dataclass
class Params: class Params:
n_vocab: int n_vocab: int
n_embd: int n_embd: int
n_layer: int n_layer: int
n_ctx: int n_ctx: int
n_ff: int n_ff: int
n_head: int n_head: int
n_head_kv: int n_head_kv: int
f_norm_eps: float n_experts: int | None = None
n_experts_used: int | None = None
f_norm_eps: float | None = None
rope_scaling_type: gguf.RopeScalingType | None = None
f_rope_freq_base: float | None = None f_rope_freq_base: float | None = None
f_rope_scale: float | None = None f_rope_scale: float | None = None
n_orig_ctx: int | None = None
rope_finetuned: bool | None = None
ftype: GGMLFileType | None = None ftype: GGMLFileType | None = None
@ -167,11 +182,11 @@ class Params:
# try transformer naming first # try transformer naming first
if "model.layers.0.self_attn.q_proj.weight" in model: if "model.layers.0.self_attn.q_proj.weight" in model:
n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model) n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
elif "model.layers.0.self_attn.W_pack.weight" in model: # next: try baichuan naming elif "model.layers.0.self_attn.W_pack.weight" in model: # next: try baichuan naming
n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model) n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model)
else: else:
n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model) n_layer = next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
if n_layer < 1: if n_layer < 1:
raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n" raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n"
@ -199,20 +214,20 @@ class Params:
def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params: def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params:
config = json.load(open(config_path)) config = json.load(open(config_path))
n_vocab = config["vocab_size"] rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None
n_embd = config["hidden_size"]
n_layer = config["num_hidden_layers"]
n_ff = config["intermediate_size"]
n_head = config["num_attention_heads"]
n_head_kv = config["num_key_value_heads"] if "num_key_value_heads" in config else n_head
f_norm_eps = config["rms_norm_eps"]
f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None
rope_scaling = config.get("rope_scaling") rope_scaling = config.get("rope_scaling")
if isinstance(rope_scaling, dict) and rope_scaling.get("type") == "linear":
f_rope_scale = config["rope_scaling"].get("factor") if rope_scaling is not None and (typ := rope_scaling.get("type")):
else: rope_factor = rope_scaling.get("factor")
f_rope_scale = None f_rope_scale = rope_factor
if typ == "linear":
rope_scaling_type = gguf.RopeScalingType.LINEAR
elif typ == "yarn":
rope_scaling_type = gguf.RopeScalingType.YARN
n_orig_ctx = rope_scaling['original_max_position_embeddings']
rope_finetuned = rope_scaling['finetuned']
else:
raise NotImplementedError(f'Unknown rope scaling type: {typ}')
if "max_sequence_length" in config: if "max_sequence_length" in config:
n_ctx = config["max_sequence_length"] n_ctx = config["max_sequence_length"]
@ -222,17 +237,29 @@ class Params:
raise Exception("failed to guess 'n_ctx'. This model is unknown or unsupported.\n" raise Exception("failed to guess 'n_ctx'. This model is unknown or unsupported.\n"
"Suggestion: provide 'config.json' of the model in the same directory containing model files.") "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
n_experts = None
n_experts_used = None
if "num_local_experts" in config:
n_experts = config["num_local_experts"]
n_experts_used = config["num_experts_per_tok"]
return Params( return Params(
n_vocab = n_vocab, n_vocab = config["vocab_size"],
n_embd = n_embd, n_embd = config["hidden_size"],
n_layer = n_layer, n_layer = config["num_hidden_layers"],
n_ctx = n_ctx, n_ctx = n_ctx,
n_ff = n_ff, n_ff = config["intermediate_size"],
n_head = n_head, n_head = (n_head := config["num_attention_heads"]),
n_head_kv = n_head_kv, n_head_kv = config.get("num_key_value_heads", n_head),
f_norm_eps = f_norm_eps, n_experts = n_experts,
f_rope_freq_base = f_rope_freq_base, n_experts_used = n_experts_used,
f_rope_scale = f_rope_scale, f_norm_eps = config["rms_norm_eps"],
f_rope_freq_base = config.get("rope_theta"),
rope_scaling_type = rope_scaling_type,
f_rope_scale = f_rope_scale,
n_orig_ctx = n_orig_ctx,
rope_finetuned = rope_finetuned,
) )
# LLaMA v2 70B params.json # LLaMA v2 70B params.json
@ -241,17 +268,15 @@ class Params:
def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params: def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
config = json.load(open(config_path)) config = json.load(open(config_path))
n_vocab = config["vocab_size"] if "vocab_size" in config else -1 n_experts = None
n_embd = config["dim"] n_experts_used = None
n_layer = config["n_layers"] f_rope_freq_base = None
n_ff = -1
n_head = config["n_heads"]
n_head_kv = config["n_kv_heads"] if "n_kv_heads" in config else n_head
f_norm_eps = config["norm_eps"]
f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None
# hack to determine LLaMA v1 vs v2 vs CodeLlama # hack to determine LLaMA v1 vs v2 vs CodeLlama
if f_rope_freq_base == 1000000: if config.get("moe"):
# Mixtral
n_ctx = 32768
elif config.get("rope_theta") == 1000000:
# CodeLlama # CodeLlama
n_ctx = 16384 n_ctx = 16384
elif config["norm_eps"] == 1e-05: elif config["norm_eps"] == 1e-05:
@ -261,22 +286,27 @@ class Params:
# LLaMA v1 # LLaMA v1
n_ctx = 2048 n_ctx = 2048
if n_vocab == -1: if "layers.0.feed_forward.w1.weight" in model:
n_vocab = model["tok_embeddings.weight"].shape[0]
if n_ff == -1:
n_ff = model["layers.0.feed_forward.w1.weight"].shape[0] n_ff = model["layers.0.feed_forward.w1.weight"].shape[0]
if config.get("moe"):
n_ff = model["layers.0.feed_forward.experts.0.w1.weight"].shape[0]
n_experts = config["moe"]["num_experts"]
n_experts_used = config["moe"]["num_experts_per_tok"]
f_rope_freq_base = 1e6
return Params( return Params(
n_vocab = n_vocab, n_vocab = model["tok_embeddings.weight"].shape[0],
n_embd = n_embd, n_embd = config["dim"],
n_layer = n_layer, n_layer = config["n_layers"],
n_ctx = n_ctx, n_ctx = n_ctx,
n_ff = n_ff, n_ff = n_ff,
n_head = n_head, n_head = (n_head := config["n_heads"]),
n_head_kv = n_head_kv, n_head_kv = config.get("n_kv_heads", n_head),
f_norm_eps = f_norm_eps, n_experts = n_experts,
f_rope_freq_base = f_rope_freq_base, n_experts_used = n_experts_used,
f_norm_eps = config["norm_eps"],
f_rope_freq_base = config.get("rope_theta", f_rope_freq_base),
) )
@staticmethod @staticmethod
@ -298,151 +328,152 @@ class Params:
return params return params
# class VocabLoader:
# vocab def __init__(self, params: Params, fname_tokenizer: Path) -> None:
# try:
from transformers import AutoTokenizer
except ImportError as e:
raise ImportError(
"To use VocabLoader, please install the `transformers` package. "
"You can install it with `pip install transformers`."
) from e
class BpeVocab: try:
def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None: self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer), trust_remote_code=True)
self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read()) except ValueError:
added_tokens: dict[str, int] self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer), use_fast=False, trust_remote_code=True)
if fname_added_tokens is not None:
# FIXME: Verify that added tokens here _cannot_ overlap with the main vocab. self.added_tokens_dict: OrderedDict[str, int] = OrderedDict()
added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
for tok, tokidx in sorted(self.tokenizer.get_added_vocab().items(), key=lambda x: x[1]):
if tokidx >= params.n_vocab or tokidx < self.tokenizer.vocab_size:
continue
self.added_tokens_dict[tok] = tokidx
self.unk_token_id: int = self.tokenizer.unk_token_id
self.specials: dict[str, int] = {
tok: self.tokenizer.get_vocab()[tok]
for tok in self.tokenizer.all_special_tokens
}
self.special_ids: set[int] = set(self.tokenizer.all_special_ids)
self.vocab_size_base: int = self.tokenizer.vocab_size
self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_dict)
self.fname_tokenizer: Path = fname_tokenizer
vocab_file = "tokenizer.model"
path_candidate = find_vocab_file_path(self.fname_tokenizer, vocab_file)
if path_candidate is not None:
self.spm = SentencePieceProcessor(str(path_candidate))
print(self.spm.vocab_size(), self.vocab_size_base)
else: else:
# Fall back to trying to find the added tokens in tokenizer.json self.spm = None
tokenizer_json_file = fname_tokenizer.parent / 'tokenizer.json'
if not tokenizer_json_file.is_file():
added_tokens = {}
else:
tokenizer_json = json.load(open(tokenizer_json_file, encoding="utf-8"))
added_tokens = dict(
(item['content'], item['id'])
for item in tokenizer_json.get('added_tokens', [])
# Added tokens here can be duplicates of the main vocabulary.
if item['content'] not in self.bpe_tokenizer )
vocab_size: int = len(self.bpe_tokenizer) def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
expected_ids = list(range(vocab_size, vocab_size + len(added_tokens))) tokenizer = self.tokenizer
actual_ids = sorted(added_tokens.values()) reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.get_vocab().items()}
if expected_ids != actual_ids: added_tokens_ids = set(self.added_tokens_dict.values())
expected_end_id = vocab_size + len(actual_ids) - 1
raise Exception(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}")
items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1]) for i in range(self.vocab_size_base):
self.added_tokens_list = [text for (text, idx) in items] if i in added_tokens_ids:
self.vocab_size_base: int = vocab_size continue
self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
self.fname_tokenizer = fname_tokenizer
self.fname_added_tokens = fname_added_tokens
def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: text = reverse_vocab[i].encode("utf-8")
tokenizer = self.bpe_tokenizer yield text, self.get_token_score(i), self.get_token_type(i)
from transformers.models.gpt2 import tokenization_gpt2 # type: ignore[import]
byte_encoder = tokenization_gpt2.bytes_to_unicode()
byte_decoder = {v: k for k, v in byte_encoder.items()}
score = 0.0
for i, item in enumerate(tokenizer):
text: bytes = item.encode("utf-8")
# FIXME: These shouldn't be hardcoded, but it's probably better than the current behavior?
if i <= 258 and text.startswith(b'<') and text.endswith(b'>'):
if i == 0 and text == b'<unk>':
toktype = gguf.TokenType.UNKNOWN
elif i == 1 or i == 2:
toktype = gguf.TokenType.CONTROL
elif i >= 3 and text.startswith(b'<0x'):
toktype = gguf.TokenType.BYTE
else:
toktype = gguf.TokenType.NORMAL
else:
toktype = gguf.TokenType.NORMAL
yield text, score, toktype
def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: def get_token_type(self, token_id: int) -> gguf.TokenType:
for text in self.added_tokens_list: toktype = gguf.TokenType.NORMAL
score = -1000.0
yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: if self.spm is not None and token_id < self.spm.vocab_size():
yield from self.bpe_tokens() if self.spm.is_unknown(token_id):
yield from self.added_tokens()
def __repr__(self) -> str:
return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
class SentencePieceVocab:
def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
added_tokens: dict[str, int]
if fname_added_tokens is not None:
added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
else:
added_tokens = {}
vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
actual_ids = sorted(added_tokens.values())
if expected_ids != actual_ids:
raise Exception(f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}")
items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
self.added_tokens_list = [text for (text, idx) in items]
self.vocab_size_base: int = vocab_size
self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
self.fname_tokenizer = fname_tokenizer
self.fname_added_tokens = fname_added_tokens
def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
tokenizer = self.sentencepiece_tokenizer
for i in range(tokenizer.vocab_size()):
piece = tokenizer.id_to_piece(i)
text: bytes = piece.encode("utf-8")
score: float = tokenizer.get_score(i)
toktype = gguf.TokenType.NORMAL
if tokenizer.is_unknown(i):
toktype = gguf.TokenType.UNKNOWN toktype = gguf.TokenType.UNKNOWN
if tokenizer.is_control(i): if self.spm.is_control(token_id):
toktype = gguf.TokenType.CONTROL
if self.spm.is_unused(token_id):
toktype = gguf.TokenType.UNUSED
if self.spm.is_byte(token_id):
toktype = gguf.TokenType.BYTE
else:
if token_id == self.unk_token_id:
toktype = gguf.TokenType.UNKNOWN
if token_id in self.special_ids:
toktype = gguf.TokenType.CONTROL toktype = gguf.TokenType.CONTROL
# NOTE: I think added_tokens are user defined. return toktype
# ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
# if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED
if tokenizer.is_unused(i): def get_token_score(self, token_id: int) -> float:
toktype = gguf.TokenType.UNUSED if self.spm is not None and token_id < self.spm.vocab_size():
if tokenizer.is_byte(i): return cast(float, self.spm.get_score(token_id))
toktype = gguf.TokenType.BYTE return 0.0
yield text, score, toktype
def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
for text in self.added_tokens_list:
score = -1000.0 for text in self.added_tokens_dict:
yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED if text in self.specials:
toktype = self.get_token_type(self.specials[text])
score = self.get_token_score(self.specials[text])
else:
toktype = gguf.TokenType.USER_DEFINED
score = -1000.0
yield text.encode("utf-8"), score, toktype
def has_newline_token(self) -> bool:
return '<0x0A>' in self.tokenizer.vocab or '\n' in self.tokenizer.vocab
def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
yield from self.sentencepiece_tokens() yield from self.hf_tokens()
yield from self.added_tokens() yield from self.added_tokens()
def __repr__(self) -> str: def get_vocab_type(self) -> str:
return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>" path_candidates = []
vocab_file = "tokenizer.model"
path_candidates.append(vocab_file)
path_candidate = find_vocab_file_path(self.fname_tokenizer, vocab_file)
if path_candidate is not None:
return "llama"
vocab_file = "vocab.json"
path_candidates.append(vocab_file)
path_candidate = find_vocab_file_path(self.fname_tokenizer, vocab_file)
if path_candidate is not None:
return "gpt2"
vocab_file = "tokenizer.json"
path_candidates.append(vocab_file)
path_candidate = find_vocab_file_path(self.fname_tokenizer, vocab_file)
if path_candidate:
if not self.has_newline_token():
return "gpt2"
return "llama"
raise FileNotFoundError(
f"Could not find {path_candidates} in {self.fname_tokenizer} or its parent; "
"if it's in another directory, pass the directory as --vocab-dir"
)
def __repr__(self) -> str:
return f"<VocabLoader with {self.vocab_size_base} base tokens and {len(self.added_tokens_dict)} added tokens>"
Vocab: TypeAlias = 'VocabLoader'
Vocab: TypeAlias = 'BpeVocab | SentencePieceVocab'
# #
# data loading # data loading
# TODO: reuse (probably move to gguf.py?) # TODO: reuse (probably move to gguf.py?)
# #
def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray: def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
#print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) ) # print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) )
if n_head_kv is not None and n_head != n_head_kv: if n_head_kv is not None and n_head != n_head_kv:
n_head = n_head_kv n_head = n_head_kv
return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
.swapaxes(1, 2) .swapaxes(1, 2)
.reshape(weights.shape)) .reshape(weights.shape))
class Tensor(metaclass=ABCMeta): class Tensor(metaclass=ABCMeta):
@ -523,7 +554,7 @@ class LazyTensor:
ret = self._load() ret = self._load()
# Should be okay if it maps to the same numpy type? # Should be okay if it maps to the same numpy type?
assert ret.data_type == self.data_type or (self.data_type.dtype == ret.data_type.dtype), \ assert ret.data_type == self.data_type or (self.data_type.dtype == ret.data_type.dtype), \
(self.data_type, ret.data_type, self.description) (self.data_type, ret.data_type, self.description)
return ret return ret
def astype(self, data_type: DataType) -> LazyTensor: def astype(self, data_type: DataType) -> LazyTensor:
@ -596,7 +627,7 @@ def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus:
if any("model.embed_tokens.weight" in mp.model for mp in models_plus): if any("model.embed_tokens.weight" in mp.model for mp in models_plus):
# Transformers models put different tensors in different files, but # Transformers models put different tensors in different files, but
# don't split indivdual tensors between files. # don't split individual tensors between files.
model: LazyModel = {} model: LazyModel = {}
for mp in models_plus: for mp in models_plus:
model.update(mp.model) model.update(mp.model)
@ -611,6 +642,7 @@ def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTe
return lazy_tensor.load().permute(n_head, n_head_kv) return lazy_tensor.load().permute(n_head, n_head_kv)
return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description) return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description)
def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int, n_head_kv: int) -> LazyTensor: def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int, n_head_kv: int) -> LazyTensor:
def load() -> Tensor: def load() -> Tensor:
return lazy_tensor.load().permute_part(n_part, n_head, n_head_kv) return lazy_tensor.load().permute_part(n_part, n_head, n_head_kv)
@ -618,6 +650,7 @@ def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int, n_head_
s[0] = s[0] // 3 s[0] = s[0] // 3
return LazyTensor(load, s, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description) return LazyTensor(load, s, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description)
def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor: def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor:
def load() -> Tensor: def load() -> Tensor:
return lazy_tensor.load().part(n_part) return lazy_tensor.load().part(n_part)
@ -687,7 +720,7 @@ class LazyUnpickler(pickle.Unpickler):
return func(*args) return func(*args)
CLASSES: dict[tuple[str, str], Any] = { CLASSES: dict[tuple[str, str], Any] = {
# getattr used here as a workaround for mypy not being smart enough to detrmine # getattr used here as a workaround for mypy not being smart enough to determine
# the staticmethods have a __func__ attribute. # the staticmethods have a __func__ attribute.
('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'), ('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
('torch._utils', '_rebuild_tensor_v2'): getattr(lazy_rebuild_tensor_v2, '__func__'), ('torch._utils', '_rebuild_tensor_v2'): getattr(lazy_rebuild_tensor_v2, '__func__'),
@ -713,6 +746,7 @@ def lazy_load_torch_file(outer_fp: IO[bytes], path: Path) -> ModelPlus:
data_base_path=pickle_paths[0][:-4], data_base_path=pickle_paths[0][:-4],
zip_file=zf) zip_file=zf)
model = unpickler.load() model = unpickler.load()
if 'model' in model: model = model['model']
as_dict = dict(model.items()) as_dict = dict(model.items())
return ModelPlus(model=as_dict, paths=[path], format='torch', vocab=None) return ModelPlus(model=as_dict, paths=[path], format='torch', vocab=None)
@ -766,6 +800,7 @@ def lazy_load_file(path: Path) -> ModelPlus:
In = TypeVar('In') In = TypeVar('In')
Out = TypeVar('Out') Out = TypeVar('Out')
def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int, max_workers: int | None = None, use_processpool_executor: bool = False) -> Iterable[Out]: def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int, max_workers: int | None = None, use_processpool_executor: bool = False) -> Iterable[Out]:
'''Parallel map, but with backpressure. If the caller doesn't call `next` '''Parallel map, but with backpressure. If the caller doesn't call `next`
fast enough, this will stop calling `func` at some point rather than fast enough, this will stop calling `func` at some point rather than
@ -800,26 +835,34 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc
break break
yield result yield result
def check_vocab_size(params: Params, vocab: Vocab) -> None:
def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> None:
if params.n_vocab != vocab.vocab_size: if params.n_vocab != vocab.vocab_size:
assert isinstance(vocab, BpeVocab) or isinstance(vocab, SentencePieceVocab) if params.n_vocab == vocab.vocab_size:
if params.n_vocab == vocab.vocab_size_base:
print("Ignoring added_tokens.json since model matches vocab size without it.") print("Ignoring added_tokens.json since model matches vocab size without it.")
vocab.added_tokens_list = [] vocab.added_tokens_dict = OrderedDict()
vocab.vocab_size = vocab.vocab_size_base vocab.vocab_size = vocab.vocab_size
return
if pad_vocab and params.n_vocab > vocab.vocab_size:
pad_count = params.n_vocab - vocab.vocab_size
print(f'Padding vocab with {pad_count} token(s) - <dummy00001> through <dummy{pad_count:05}>')
for i in range(1, (params.n_vocab - vocab.vocab_size) + 1):
vocab.added_tokens_dict[f'<dummy{i:05}>'] = -1
vocab.vocab_size = params.n_vocab
return return
msg = f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer}" msg = f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer}"
if vocab.fname_added_tokens is not None:
msg += f" combined with {vocab.fname_added_tokens}"
msg += f" has {vocab.vocab_size})." msg += f" has {vocab.vocab_size})."
if vocab.vocab_size < params.n_vocab < vocab.vocab_size + 20 and vocab.fname_added_tokens is None: if vocab.vocab_size < params.n_vocab < vocab.vocab_size + 20:
msg += f" Most likely you are missing added_tokens.json (should be in {vocab.fname_tokenizer.parent})." msg += f" Most likely you are missing added_tokens.json (should be in {vocab.fname_tokenizer.parent})."
if vocab.vocab_size < params.n_vocab:
msg += " Possibly try using the --padvocab option."
raise Exception(msg) raise Exception(msg)
class OutputFile: class OutputFile:
def __init__(self, fname_out: Path) -> None: def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH]) self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
def add_meta_arch(self, params: Params) -> None: def add_meta_arch(self, params: Params) -> None:
name = "LLaMA" name = "LLaMA"
@ -838,13 +881,31 @@ class OutputFile:
self.gguf.add_rope_dimension_count(params.n_embd // params.n_head) self.gguf.add_rope_dimension_count(params.n_embd // params.n_head)
self.gguf.add_head_count (params.n_head) self.gguf.add_head_count (params.n_head)
self.gguf.add_head_count_kv (params.n_head_kv) self.gguf.add_head_count_kv (params.n_head_kv)
self.gguf.add_layer_norm_rms_eps (params.f_norm_eps)
if params.n_experts:
self.gguf.add_expert_count(params.n_experts)
if params.n_experts_used:
self.gguf.add_expert_used_count(params.n_experts_used)
if params.f_norm_eps:
self.gguf.add_layer_norm_rms_eps(params.f_norm_eps)
else:
raise ValueError('f_norm_eps is None')
if params.f_rope_freq_base is not None: if params.f_rope_freq_base is not None:
self.gguf.add_rope_freq_base(params.f_rope_freq_base) self.gguf.add_rope_freq_base(params.f_rope_freq_base)
if params.f_rope_scale is not None: if params.rope_scaling_type:
self.gguf.add_rope_scale_linear(params.f_rope_scale) assert params.f_rope_scale is not None
self.gguf.add_rope_scaling_type(params.rope_scaling_type)
self.gguf.add_rope_scaling_factor(params.f_rope_scale)
if params.n_orig_ctx is not None:
self.gguf.add_rope_scaling_orig_ctx_len(params.n_orig_ctx)
if params.rope_finetuned is not None:
self.gguf.add_rope_scaling_finetuned(params.rope_finetuned)
if params.ftype is not None: if params.ftype is not None:
self.gguf.add_file_type(params.ftype) self.gguf.add_file_type(params.ftype)
@ -859,12 +920,8 @@ class OutputFile:
scores.append(score) scores.append(score)
toktypes.append(toktype) toktypes.append(toktype)
if isinstance(vocab, SentencePieceVocab): vocab_type = vocab.get_vocab_type()
self.gguf.add_tokenizer_model("llama") self.gguf.add_tokenizer_model(vocab_type)
elif isinstance(vocab, BpeVocab):
self.gguf.add_tokenizer_model("gpt2")
else:
raise ValueError(f'Unknown vocab type: Not BpeVocab or SentencePieceVocab')
self.gguf.add_token_list(tokens) self.gguf.add_token_list(tokens)
self.gguf.add_token_scores(scores) self.gguf.add_token_scores(scores)
self.gguf.add_token_types(toktypes) self.gguf.add_token_types(toktypes)
@ -890,10 +947,14 @@ class OutputFile:
self.gguf.close() self.gguf.close()
@staticmethod @staticmethod
def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab) -> None: def write_vocab_only(
check_vocab_size(params, vocab) fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
pad_vocab: bool = False,
) -> None:
check_vocab_size(params, vocab, pad_vocab = pad_vocab)
of = OutputFile(fname_out) of = OutputFile(fname_out, endianess=endianess)
# meta data # meta data
of.add_meta_arch(params) of.add_meta_arch(params)
@ -918,10 +979,15 @@ class OutputFile:
return dt.quantize(arr) return dt.quantize(arr)
@staticmethod @staticmethod
def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY) -> None: def write_all(
check_vocab_size(params, vocab) fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab,
concurrency: int = DEFAULT_CONCURRENCY,
endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
pad_vocab: bool = False,
) -> None:
check_vocab_size(params, vocab, pad_vocab = pad_vocab)
of = OutputFile(fname_out) of = OutputFile(fname_out, endianess=endianess)
# meta data # meta data
of.add_meta_arch(params) of.add_meta_arch(params)
@ -952,8 +1018,9 @@ class OutputFile:
of.close() of.close()
def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType: def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType:
wq_type = model[NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0)+".weight"].data_type wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0) + ".weight"].data_type
if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32): if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32):
return GGMLFileType.AllF32 return GGMLFileType.AllF32
@ -966,10 +1033,12 @@ def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileT
raise Exception(f"Unexpected combination of types: {name_to_type}") raise Exception(f"Unexpected combination of types: {name_to_type}")
def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel: def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
return {name: tensor.astype(output_type.type_for_tensor(name, tensor)) return {name: tensor.astype(output_type.type_for_tensor(name, tensor))
for (name, tensor) in model.items()} for (name, tensor) in model.items()}
def convert_model_names(model: LazyModel, params: Params) -> LazyModel: def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
tmap = gguf.TensorNameMap(ARCH, params.n_layer) tmap = gguf.TensorNameMap(ARCH, params.n_layer)
should_skip: set[gguf.MODEL_TENSOR] = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, [])) should_skip: set[gguf.MODEL_TENSOR] = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, []))
@ -982,7 +1051,7 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
print(f"Permuting layer {i}") print(f"Permuting layer {i}")
tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, params.n_head) tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, params.n_head)
tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_head_kv) tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_head_kv)
#tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"] # tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
elif f"model.layers.{i}.self_attn.W_pack.weight" in model: elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
print(f"Unpacking and permuting layer {i}") print(f"Unpacking and permuting layer {i}")
tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head) tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head)
@ -1007,6 +1076,7 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
return out return out
def nth_multifile_path(path: Path, n: int) -> Path | None: def nth_multifile_path(path: Path, n: int) -> Path | None:
'''Given any path belonging to a multi-file model (e.g. foo.bin.1), return '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return
the nth path in the model. the nth path in the model.
@ -1051,7 +1121,8 @@ def load_some_model(path: Path) -> ModelPlus:
# Be extra-friendly and accept either a file or a directory: # Be extra-friendly and accept either a file or a directory:
if path.is_dir(): if path.is_dir():
# Check if it's a set of safetensors files first # Check if it's a set of safetensors files first
files = list(path.glob("model-00001-of-*.safetensors")) globs = ["model-00001-of-*.safetensors", "model.safetensors"]
files = [file for glob in globs for file in path.glob(glob)]
if not files: if not files:
# Try the PyTorch patterns too, with lower priority # Try the PyTorch patterns too, with lower priority
globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"] globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"]
@ -1072,35 +1143,17 @@ def load_some_model(path: Path) -> ModelPlus:
return model_plus return model_plus
def load_vocab(path: Path, vocabtype: str | None) -> Vocab: def find_vocab_file_path(path: Path, vocab_file: str) -> Optional[Path]:
# Be extra-friendly and accept either a file or a directory. Also, if it's path2 = path / vocab_file
# a directory, it might be the model directory, and tokenizer.model might # Use `.parent` instead of /.. to handle the symlink case better.
# be in the parent of that. path3 = path.parent / vocab_file
if path.is_dir():
vocab_file = "tokenizer.model"
if vocabtype == 'bpe':
vocab_file = "vocab.json"
path2 = path / vocab_file
# Use `.parent` instead of /.. to handle the symlink case better.
path3 = path.parent / vocab_file
if path2.exists():
path = path2
elif path3.exists():
path = path3
else:
raise FileNotFoundError(
f"Could not find {vocab_file} in {path} or its parent; "
"if it's in another directory, pass the directory as --vocab-dir")
print(f"Loading vocab file '{path}', type '{vocabtype}'") if path2.exists():
return path2
if path3.exists():
return path3
added_tokens_path = path.parent / "added_tokens.json" return None
if vocabtype == "bpe":
return BpeVocab(path, added_tokens_path if added_tokens_path.exists() else None)
elif vocabtype == "spm":
return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None)
else:
raise ValueError(f"Unsupported vocabulary type {vocabtype}")
def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path: def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path:
@ -1127,19 +1180,24 @@ def do_dump_model(model_plus: ModelPlus) -> None:
def main(args_in: list[str] | None = None) -> None: def main(args_in: list[str] | None = None) -> None:
output_choices = ["f32", "f16"]
if np.uint32(1) == np.uint32(1).newbyteorder("<"):
# We currently only support Q8_0 output on little endian systems.
output_choices.append("q8_0")
parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file") parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file")
parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model") parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file") parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
parser.add_argument("--outtype", choices=["f32", "f16", "q8_0"], help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)") parser.add_argument("--outtype", choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file") parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)") parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
parser.add_argument("--vocabtype", choices=["spm", "bpe"], help="vocab format (default: spm)", default="spm")
parser.add_argument("--ctx", type=int, help="model training context (default: based on input)") parser.add_argument("--ctx", type=int, help="model training context (default: based on input)")
parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default = DEFAULT_CONCURRENCY) parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default = DEFAULT_CONCURRENCY)
args = parser.parse_args(args_in) parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine")
parser.add_argument("--padvocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides")
args = parser.parse_args(args_in)
if args.dump_single: if args.dump_single:
model_plus = lazy_load_file(args.model) model_plus = lazy_load_file(args.model)
do_dump_model(model_plus) do_dump_model(model_plus)
@ -1153,6 +1211,9 @@ def main(args_in: list[str] | None = None) -> None:
if args.dump: if args.dump:
do_dump_model(model_plus) do_dump_model(model_plus)
return return
endianess = gguf.GGUFEndian.LITTLE
if args.bigendian:
endianess = gguf.GGUFEndian.BIG
params = Params.load(model_plus) params = Params.load(model_plus)
if params.n_ctx == -1: if params.n_ctx == -1:
@ -1174,12 +1235,16 @@ def main(args_in: list[str] | None = None) -> None:
vocab: Vocab vocab: Vocab
if args.vocab_only: if args.vocab_only:
assert args.outfile, "need --outfile if using --vocab-only" if not args.outfile:
raise ValueError("need --outfile if using --vocab-only")
# FIXME: Try to respect vocab_dir somehow? # FIXME: Try to respect vocab_dir somehow?
vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype) vocab = VocabLoader(params, args.vocab_dir or args.model)
special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = args.vocabtype == 'bpe') special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent,
load_merges = True,
n_vocab = vocab.vocab_size)
outfile = args.outfile outfile = args.outfile
OutputFile.write_vocab_only(outfile, params, vocab, special_vocab) OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
endianess = endianess, pad_vocab = args.padvocab)
print(f"Wrote {outfile}") print(f"Wrote {outfile}")
return return
@ -1187,10 +1252,15 @@ def main(args_in: list[str] | None = None) -> None:
vocab = model_plus.vocab vocab = model_plus.vocab
else: else:
vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
vocab = load_vocab(vocab_dir, args.vocabtype) vocab = VocabLoader(params, vocab_dir)
# FIXME: Try to respect vocab_dir somehow?
special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = args.vocabtype == 'bpe')
# FIXME: Try to respect vocab_dir somehow?
print(f"Vocab info: {vocab}")
special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent,
load_merges = True,
n_vocab = vocab.vocab_size)
print(f"Special vocab info: {special_vocab}")
model = model_plus.model model = model_plus.model
model = convert_model_names(model, params) model = convert_model_names(model, params)
ftype = pick_output_type(model, args.outtype) ftype = pick_output_type(model, args.outtype)
@ -1200,7 +1270,8 @@ def main(args_in: list[str] | None = None) -> None:
params.ftype = ftype params.ftype = ftype
print(f"Writing {outfile}, format {ftype}") print(f"Writing {outfile}, format {ftype}")
OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency) OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
concurrency = args.concurrency, endianess = endianess, pad_vocab = args.padvocab)
print(f"Wrote {outfile}") print(f"Wrote {outfile}")

View file

@ -49,7 +49,7 @@ According to the BLIS documentation, we could set the following
environment variables to modify the behavior of openmp: environment variables to modify the behavior of openmp:
```bash ```bash
export GOMP_GPU_AFFINITY="0-19" export GOMP_CPU_AFFINITY="0-19"
export BLIS_NUM_THREADS=14 export BLIS_NUM_THREADS=14
``` ```

BIN
docs/llama-star/idea-arch.key Executable file

Binary file not shown.

Binary file not shown.

View file

@ -17,7 +17,7 @@ llama_model_load_internal: [cublas] total VRAM used: 17223 MB
If you see these lines, then the GPU is being used. If you see these lines, then the GPU is being used.
## Verifying that the CPU is not oversaturated ## Verifying that the CPU is not oversaturated
llama accepts a `-t N` (or `--threads N`) parameter. It's extremely important that this parameter is not too large. If your token generation is extremely slow, try setting this number to 1. If this significantly improves your token generation speed, then your CPU is being oversaturated and you need to explicitly set this parameter to the number of the physicial CPU cores on your machine (even if you utilize a GPU). If in doubt, start with 1 and double the amount until you hit a performance bottleneck, then scale the number down. llama accepts a `-t N` (or `--threads N`) parameter. It's extremely important that this parameter is not too large. If your token generation is extremely slow, try setting this number to 1. If this significantly improves your token generation speed, then your CPU is being oversaturated and you need to explicitly set this parameter to the number of the physical CPU cores on your machine (even if you utilize a GPU). If in doubt, start with 1 and double the amount until you hit a performance bottleneck, then scale the number down.
# Example of runtime flags effect on inference speed benchmark # Example of runtime flags effect on inference speed benchmark
These runs were tested on the following machine: These runs were tested on the following machine:

View file

@ -12,24 +12,28 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
if (EMSCRIPTEN) if (EMSCRIPTEN)
else() else()
add_subdirectory(baby-llama)
add_subdirectory(batched)
add_subdirectory(batched-bench)
add_subdirectory(beam-search)
add_subdirectory(benchmark)
add_subdirectory(convert-llama2c-to-ggml)
add_subdirectory(embedding)
add_subdirectory(finetune)
add_subdirectory(infill)
add_subdirectory(llama-bench)
add_subdirectory(llava)
add_subdirectory(main) add_subdirectory(main)
add_subdirectory(tokenize)
add_subdirectory(parallel)
add_subdirectory(perplexity)
add_subdirectory(quantize) add_subdirectory(quantize)
add_subdirectory(quantize-stats) add_subdirectory(quantize-stats)
add_subdirectory(perplexity)
add_subdirectory(embedding)
add_subdirectory(save-load-state) add_subdirectory(save-load-state)
add_subdirectory(benchmark)
add_subdirectory(baby-llama)
add_subdirectory(train-text-from-scratch)
add_subdirectory(finetune)
add_subdirectory(convert-llama2c-to-ggml)
add_subdirectory(simple) add_subdirectory(simple)
add_subdirectory(batched)
add_subdirectory(speculative) add_subdirectory(speculative)
add_subdirectory(parallel) add_subdirectory(lookahead)
add_subdirectory(embd-input) add_subdirectory(train-text-from-scratch)
add_subdirectory(llama-bench)
add_subdirectory(beam-search)
if (LLAMA_METAL) if (LLAMA_METAL)
add_subdirectory(metal) add_subdirectory(metal)
endif() endif()

View file

@ -1258,9 +1258,9 @@ static struct ggml_tensor * forward_lora(
} }
static void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) { static void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) {
assert(logits->n_dims == 2); assert(ggml_is_matrix(logits));
assert(probs->n_dims == 2); assert(ggml_is_matrix(probs));
assert(best_samples->n_dims == 1); assert(ggml_is_vector(best_samples));
assert(logits->ne[1] == best_samples->ne[0]); assert(logits->ne[1] == best_samples->ne[0]);
assert(logits->ne[0] == probs->ne[0]); assert(logits->ne[0] == probs->ne[0]);
assert(logits->ne[1] == probs->ne[1]); assert(logits->ne[1] == probs->ne[1]);
@ -1292,9 +1292,9 @@ static void sample_softmax_batch(
struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs,
struct ggml_tensor * best_samples struct ggml_tensor * best_samples
) { ) {
GGML_ASSERT(best_samples->n_dims == 2); GGML_ASSERT(ggml_is_matrix(best_samples));
GGML_ASSERT(logits->n_dims == 3); GGML_ASSERT(ggml_is_3d(logits));
GGML_ASSERT(probs->n_dims == 3); GGML_ASSERT(ggml_is_3d(probs));
int n_tokens = best_samples->ne[0]; int n_tokens = best_samples->ne[0];
int n_batch = best_samples->ne[1]; int n_batch = best_samples->ne[1];
int n_vocab = logits->ne[0]; int n_vocab = logits->ne[0];
@ -1334,7 +1334,7 @@ static void print_row(struct ggml_tensor * probs, int i) {
} }
static void print_matrix(struct ggml_tensor * probs) { static void print_matrix(struct ggml_tensor * probs) {
assert(probs->n_dims == 2); assert(ggml_is_matrix(probs));
for (int i = 0; i < probs->ne[1]; ++i) { for (int i = 0; i < probs->ne[1]; ++i) {
for (int k = 0; k < probs->ne[0]; ++k) { for (int k = 0; k < probs->ne[0]; ++k) {
float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k); float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k);
@ -1386,8 +1386,8 @@ static void get_example_targets(int example_id, struct ggml_tensor * tokens_inpu
static void get_example_targets_batch( static void get_example_targets_batch(
struct ggml_context * ctx, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets struct ggml_context * ctx, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets
) { ) {
GGML_ASSERT(tokens_input->n_dims == 2); GGML_ASSERT(ggml_is_matrix(tokens_input));
GGML_ASSERT( targets->n_dims == 3); GGML_ASSERT(ggml_is_3d(targets));
int n_tokens = tokens_input->ne[0]; int n_tokens = tokens_input->ne[0];
int n_batch = tokens_input->ne[1]; int n_batch = tokens_input->ne[1];
GGML_ASSERT(n_tokens == targets->ne[1]); GGML_ASSERT(n_tokens == targets->ne[1]);

View file

@ -0,0 +1,5 @@
set(TARGET batched-bench)
add_executable(${TARGET} batched-bench.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View file

@ -0,0 +1,51 @@
# llama.cpp/example/batched-bench
Benchmark the batched decoding performance of `llama.cpp`
## Usage
There are 2 modes of operation:
- `prompt not shared` - each batch has a separate prompt of size `PP` (i.e. `N_KV = B*(PP + TG)`)
- `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`)
```bash
./batched-bench MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] [MMQ] <PP> <TG> <PL>
# LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared
./batched-bench ./models/llama-7b/ggml-model-f16.gguf 16384 0 99
# LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared
./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 16384 1 99
# custom set of batches
./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 2048 0 999 0 128,256,512 128,256 1,2,4,8,16,32
```
## Sample results
- `PP` - prompt tokens per batch
- `TG` - generated tokens per batch
- `B` - number of batches
- `N_KV` - required KV cache size
- `T_PP` - prompt processing time (i.e. time to first token)
- `S_PP` - prompt processing speed (`(B*PP)/T_PP` or `PP/T_PP`)
- `T_TG` - time to generate all batches
- `S_TG` - text generation speed (`(B*TG)/T_TG`)
- `T` - total time
- `S` - total speed (i.e. all tokens / total time)
| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
| 128 | 128 | 1 | 256 | 0.108 | 1186.64 | 3.079 | 41.57 | 3.187 | 80.32 |
| 128 | 128 | 2 | 512 | 0.198 | 1295.19 | 5.029 | 50.90 | 5.227 | 97.95 |
| 128 | 128 | 4 | 1024 | 0.373 | 1373.96 | 6.878 | 74.44 | 7.251 | 141.23 |
| 128 | 128 | 8 | 2048 | 0.751 | 1363.27 | 7.344 | 139.43 | 8.095 | 252.99 |
| 128 | 128 | 16 | 4096 | 1.570 | 1304.68 | 8.455 | 242.23 | 10.024 | 408.60 |
| 128 | 128 | 32 | 8192 | 3.408 | 1201.73 | 8.801 | 465.40 | 12.209 | 670.96 |
| 128 | 256 | 1 | 384 | 0.107 | 1196.70 | 6.329 | 40.45 | 6.436 | 59.67 |
| 128 | 256 | 2 | 768 | 0.194 | 1317.45 | 10.239 | 50.00 | 10.433 | 73.61 |
| 128 | 256 | 4 | 1536 | 0.366 | 1399.03 | 13.960 | 73.35 | 14.326 | 107.22 |
| 128 | 256 | 8 | 3072 | 0.751 | 1363.92 | 15.110 | 135.54 | 15.861 | 193.69 |
| 128 | 256 | 16 | 6144 | 1.569 | 1304.93 | 18.073 | 226.64 | 19.642 | 312.80 |
| 128 | 256 | 32 | 12288 | 3.409 | 1201.35 | 19.223 | 426.15 | 22.633 | 542.93 |

View file

@ -0,0 +1,247 @@
#include "common.h"
#include "llama.h"
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <string>
#include <vector>
// mutates the input string
static std::vector<int> parse_list(char * p) {
std::vector<int> ret;
char * q = p;
while (*p) {
if (*p == ',') {
*p = '\0';
ret.push_back(std::atoi(q));
q = p + 1;
}
++p;
}
ret.push_back(std::atoi(q));
return ret;
}
int main(int argc, char ** argv) {
gpt_params params;
if (argc == 1 || argv[1][0] == '-') {
printf("usage: %s MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] [MMQ] <PP> <TG> <PL>\n" , argv[0]);
printf(" <PP>, <TG> and PL are comma-separated lists of numbers without spaces\n\n");
printf(" example: %s ggml-model-f16.gguf 2048 0 999 0 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
return 1 ;
}
int n_kv_max = 2048;
int is_pp_shared = 0;
int n_gpu_layers = 0;
int mmq = 0;
std::vector<int> n_pp = { 128, 256, 512, 1024, 2048, 3584, 7680, };
std::vector<int> n_tg = { 128, 256, };
std::vector<int> n_pl = { 1, 2, 4, 8, 16, 32, };
//std::vector<int> n_pl = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 32, };
if (argc >= 2) {
params.model = argv[1];
}
if (argc >= 3) {
n_kv_max = std::atoi(argv[2]);
}
if (argc >= 4) {
is_pp_shared = std::atoi(argv[3]);
}
if (argc >= 5) {
n_gpu_layers = std::atoi(argv[4]);
}
if (argc >= 6) {
mmq = std::atoi(argv[5]);
}
if (argc >= 7) {
n_pp = parse_list(argv[6]);
}
if (argc >= 8) {
n_tg = parse_list(argv[7]);
}
if (argc >= 9) {
n_pl = parse_list(argv[8]);
}
// init LLM
llama_backend_init(params.numa);
// initialize the model
llama_model_params model_params = llama_model_default_params();
model_params.n_gpu_layers = n_gpu_layers;
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
if (model == NULL) {
fprintf(stderr , "%s: error: unable to load model\n" , __func__);
return 1;
}
llama_context_params ctx_params = llama_context_default_params();
ctx_params.seed = 1234;
ctx_params.n_ctx = n_kv_max;
ctx_params.n_batch = 512;
ctx_params.mul_mat_q = mmq;
ctx_params.n_threads = params.n_threads;
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
if (ctx == NULL) {
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
return 1;
}
llama_batch batch = llama_batch_init(n_kv_max, 0, 1);
// decode in batches of ctx_params.n_batch tokens
auto decode_helper = [](llama_context * ctx, llama_batch & batch, int32_t n_batch) {
for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
llama_batch batch_view = {
n_tokens,
batch.token + i,
nullptr,
batch.pos + i,
batch.n_seq_id + i,
batch.seq_id + i,
batch.logits + i,
0, 0, 0, // unused
};
const int ret = llama_decode(ctx, batch_view);
if (ret != 0) {
LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
return false;
}
}
return true;
};
// warm up
{
for (int i = 0; i < 16; ++i) {
llama_batch_add(batch, 0, i, { 0 }, false);
}
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
LOG_TEE("%s: llama_decode() failed\n", __func__);
return 1;
}
}
LOG_TEE("\n");
LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, mmq = %d, n_threads = %d, n_threads_batch = %d\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, mmq, ctx_params.n_threads, ctx_params.n_threads_batch);
LOG_TEE("\n");
LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
for ( int i_pp = 0; i_pp < (int) n_pp.size(); ++i_pp) {
for ( int i_tg = 0; i_tg < (int) n_tg.size(); ++i_tg) {
for (int i_pl = 0; i_pl < (int) n_pl.size(); ++i_pl) {
const int pp = n_pp[i_pp];
const int tg = n_tg[i_tg];
const int pl = n_pl[i_pl];
const int n_ctx_req = is_pp_shared ? pp + pl*tg : pl*(pp + tg);
if (n_ctx_req > n_kv_max) {
continue;
}
llama_batch_clear(batch);
const int n_tokens = is_pp_shared ? pp : pl*pp;
for (int i = 0; i < n_tokens; ++i) {
llama_batch_add(batch, 0, i, { 0 }, false);
}
batch.logits[batch.n_tokens - 1] = true;
const auto t_pp_start = ggml_time_us();
llama_kv_cache_clear(ctx);
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
LOG_TEE("%s: llama_decode() failed\n", __func__);
return 1;
}
if (is_pp_shared) {
for (int32_t i = 1; i < pl; ++i) {
llama_kv_cache_seq_cp(ctx, 0, i, 0, pp);
}
}
const auto t_pp_end = ggml_time_us();
const auto t_tg_start = ggml_time_us();
for (int i = 0; i < tg; ++i) {
llama_batch_clear(batch);
for (int j = 0; j < pl; ++j) {
llama_batch_add(batch, 0, pp + i, { j }, true);
}
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
LOG_TEE("%s: llama_decode() failed\n", __func__);
return 1;
}
}
const auto t_tg_end = ggml_time_us();
const int32_t n_kv = n_ctx_req;
const float t_pp = (t_pp_end - t_pp_start) / 1000000.0f;
const float t_tg = (t_tg_end - t_tg_start) / 1000000.0f;
const float t = t_pp + t_tg;
const float speed_pp = is_pp_shared ? pp / t_pp : pl*pp / t_pp;
const float speed_tg = pl*tg / t_tg;
const float speed = n_kv / t;
LOG_TEE("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
}
}
}
llama_print_timings(ctx);
llama_batch_free(batch);
llama_free(ctx);
llama_free_model(model);
llama_backend_free();
fprintf(stderr, "\n\n");
return 0;
}

9
examples/batched.swift/.gitignore vendored Normal file
View file

@ -0,0 +1,9 @@
.DS_Store
/.build
/Packages
xcuserdata/
DerivedData/
.swiftpm/configuration/registries.json
.swiftpm/xcode/package.xcworkspace/contents.xcworkspacedata
.netrc
batched_swift

View file

@ -0,0 +1,6 @@
.PHONY: build
build:
xcodebuild -scheme batched_swift -destination "generic/platform=macOS" -derivedDataPath build
rm -f ./batched_swift
ln -s ./build/Build/Products/Debug/batched_swift ./batched_swift

View file

@ -0,0 +1,22 @@
// swift-tools-version: 5.5
// The swift-tools-version declares the minimum version of Swift required to build this package.
import PackageDescription
let package = Package(
name: "batched_swift",
platforms: [.macOS(.v12)],
dependencies: [
.package(name: "llama", path: "../../"),
],
targets: [
// Targets are the basic building blocks of a package, defining a module or a test suite.
// Targets can depend on other targets in this package and products from dependencies.
.executableTarget(
name: "batched_swift",
dependencies: ["llama"],
path: "Sources",
linkerSettings: [.linkedFramework("Foundation"), .linkedFramework("AppKit")]
),
]
)

View file

@ -0,0 +1,4 @@
This is a swift clone of `examples/batched`.
$ `make`
$ `./batched_swift MODEL_PATH [PROMPT] [PARALLEL]`

View file

@ -0,0 +1,260 @@
import Foundation
import llama
let arguments = CommandLine.arguments
// Check that we have at least one argument (the model path)
guard arguments.count > 1 else {
print("Usage: swift MODEL_PATH [PROMPT] [PARALLEL]")
exit(1)
}
let modelPath: String = arguments[1]
let prompt: String = arguments.count > 2 ? arguments[2] : "Hello my name is"
let n_parallel: Int = arguments.count > 3 && Int(arguments[3]) != nil ? Int(arguments[3])! : 1
// total length of the sequences including the prompt
let n_len: Int = 32
// init LLM
llama_backend_init(false)
defer {
llama_backend_free()
}
let model_params = llama_model_default_params()
guard let model = llama_load_model_from_file(modelPath.cString(using: .utf8), model_params) else {
print("Failed to load model")
exit(1)
}
defer {
llama_free_model(model)
}
var tokens = tokenize(text: prompt, add_bos: true)
let n_kv_req = UInt32(tokens.count) + UInt32((n_len - Int(tokens.count)) * n_parallel)
var context_params = llama_context_default_params()
context_params.seed = 1234
context_params.n_ctx = n_kv_req
context_params.n_batch = UInt32(max(n_len, n_parallel))
context_params.n_threads = 8
context_params.n_threads_batch = 8
let context = llama_new_context_with_model(model, context_params)
guard context != nil else {
print("Failed to initialize context")
exit(1)
}
defer {
llama_free(context)
}
let n_ctx = llama_n_ctx(context)
print("\nn_len = \(n_len), n_ctx = \(n_ctx), n_batch = \(context_params.n_batch), n_parallel = \(n_parallel), n_kv_req = \(n_kv_req)\n")
if n_kv_req > n_ctx {
print("error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", n_kv_req)
exit(1)
}
var buffer: [CChar] = []
for id: llama_token in tokens {
print(token_to_piece(token: id, buffer: &buffer) ?? "", terminator: "")
}
print("\n")
var batch = llama_batch_init(max(Int32(tokens.count), Int32(n_parallel)), 0, 1)
defer {
llama_batch_free(batch)
}
// evaluate the initial prompt
batch.n_tokens = Int32(tokens.count)
for (i, token) in tokens.enumerated() {
batch.token[i] = token
batch.pos[i] = Int32(i)
batch.n_seq_id[i] = 1
// batch.seq_id[i][0] = 0
// TODO: is this the proper way to do this?
if let seq_id = batch.seq_id[i] {
seq_id[0] = 0
}
batch.logits[i] = 0
}
// llama_decode will output logits only for the last token of the prompt
batch.logits[Int(batch.n_tokens) - 1] = 1
if llama_decode(context, batch) != 0 {
print("llama_decode() failed")
exit(1)
}
for i in 1 ..< n_parallel {
llama_kv_cache_seq_cp(context, 0, Int32(i), 0, batch.n_tokens)
}
if n_parallel > 1 {
print("generating \(n_parallel) sequences ...\n")
}
var streams: [String] = .init(repeating: "", count: n_parallel)
var streamBuffers: [[CChar]] = .init(repeating: [], count: n_parallel)
var i_batch = [Int32](repeating: batch.n_tokens - 1, count: n_parallel)
var n_cur = batch.n_tokens
var n_decode = 0
let t_main_start = ggml_time_us()
while n_cur <= n_len {
// prepare the next batch
batch.n_tokens = 0
// sample the next token for each parallel sequence / stream
for i in 0 ..< n_parallel {
if i_batch[i] < 0 {
// the stream has already finished
continue
}
var n_vocab = llama_n_vocab(model)
var logits = llama_get_logits_ith(context, i_batch[i])
var candidates: [llama_token_data] = .init(repeating: llama_token_data(), count: Int(n_vocab))
for token_id in 0 ..< n_vocab {
candidates.append(llama_token_data(id: token_id, logit: logits![Int(token_id)], p: 0.0))
}
var candidates_p: llama_token_data_array = .init(
data: &candidates,
size: candidates.count,
sorted: false
)
let top_k: Int32 = 40
let top_p: Float = 0.9
let temp: Float = 0.4
llama_sample_top_k(context, &candidates_p, top_k, 1)
llama_sample_top_p(context, &candidates_p, top_p, 1)
llama_sample_temp(context, &candidates_p, temp)
let new_token_id = llama_sample_token(context, &candidates_p)
// const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
// is it an end of stream? -> mark the stream as finished
if new_token_id == llama_token_eos(model) || n_cur == n_len {
i_batch[i] = -1
// print("")
if n_parallel > 1 {
print("stream \(i) finished at n_cur = \(n_cur)")
}
continue
}
let nextStringPiece = token_to_piece(token: new_token_id, buffer: &streamBuffers[i]) ?? ""
// if there is only one stream, we print immediately to stdout
if n_parallel == 1 {
print(nextStringPiece, terminator: "")
}
streams[i] += nextStringPiece
// push this new token for next evaluation
batch.token[Int(batch.n_tokens)] = new_token_id
batch.pos[Int(batch.n_tokens)] = n_cur
batch.n_seq_id[Int(batch.n_tokens)] = 1
if let seq_id = batch.seq_id[Int(batch.n_tokens)] {
seq_id[0] = Int32(i)
}
batch.logits[Int(batch.n_tokens)] = 1
i_batch[i] = batch.n_tokens
batch.n_tokens += 1
n_decode += 1
}
// all streams are finished
if batch.n_tokens == 0 {
break
}
n_cur += 1
// evaluate the current batch with the transformer model
if llama_decode(context, batch) != 0 {
print("llama_decode() failed")
exit(1)
}
}
if n_parallel > 1 {
print("\n")
for (i, stream) in streams.enumerated() {
print("sequence \(i):\n\n\(prompt)\(stream)\n")
}
}
let t_main_end = ggml_time_us()
print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n")
llama_print_timings(context)
private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
let utf8Count = text.utf8.count
let n_tokens = utf8Count + (add_bos ? 1 : 0)
let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false)
var swiftTokens: [llama_token] = []
for i in 0 ..< tokenCount {
swiftTokens.append(tokens[Int(i)])
}
tokens.deallocate()
return swiftTokens
}
private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String? {
var result = [CChar](repeating: 0, count: 8)
let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count))
if nTokens < 0 {
let actualTokensCount = -Int(nTokens)
result = .init(repeating: 0, count: actualTokensCount)
let check = llama_token_to_piece(
model,
token,
&result,
Int32(result.count)
)
assert(check == actualTokensCount)
} else {
result.removeLast(result.count - Int(nTokens))
}
if buffer.isEmpty, let utfString = String(cString: result + [0], encoding: .utf8) {
return utfString
} else {
buffer.append(contentsOf: result)
let data = Data(buffer.map { UInt8(bitPattern: $0) })
if buffer.count >= 4 { // 4 bytes is the max length of a utf8 character so if we're here we need to reset the buffer
buffer = []
}
guard let bufferString = String(data: data, encoding: .utf8) else {
return nil
}
buffer = []
return bufferString
}
}

View file

@ -11,12 +11,19 @@ int main(int argc, char ** argv) {
gpt_params params; gpt_params params;
if (argc == 1 || argv[1][0] == '-') { if (argc == 1 || argv[1][0] == '-') {
printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL]\n" , argv[0]); printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL] [LEN] [NGL]\n" , argv[0]);
return 1 ; return 1 ;
} }
// number of parallel batches
int n_parallel = 1; int n_parallel = 1;
// total length of the sequences including the prompt
int n_len = 32;
// number of layers to offload to the GPU
int n_gpu_layers = 0;
if (argc >= 2) { if (argc >= 2) {
params.model = argv[1]; params.model = argv[1];
} }
@ -29,13 +36,18 @@ int main(int argc, char ** argv) {
n_parallel = std::atoi(argv[3]); n_parallel = std::atoi(argv[3]);
} }
if (argc >= 5) {
n_len = std::atoi(argv[4]);
}
if (argc >= 6) {
n_gpu_layers = std::atoi(argv[5]);
}
if (params.prompt.empty()) { if (params.prompt.empty()) {
params.prompt = "Hello my name is"; params.prompt = "Hello my name is";
} }
// total length of the sequences including the prompt
const int n_len = 32;
// init LLM // init LLM
llama_backend_init(params.numa); llama_backend_init(params.numa);
@ -44,7 +56,7 @@ int main(int argc, char ** argv) {
llama_model_params model_params = llama_model_default_params(); llama_model_params model_params = llama_model_default_params();
// model_params.n_gpu_layers = 99; // offload all layers to the GPU model_params.n_gpu_layers = n_gpu_layers;
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
@ -66,7 +78,7 @@ int main(int argc, char ** argv) {
ctx_params.seed = 1234; ctx_params.seed = 1234;
ctx_params.n_ctx = n_kv_req; ctx_params.n_ctx = n_kv_req;
ctx_params.n_batch = std::max(n_len, n_parallel); ctx_params.n_batch = std::max(n_len, n_parallel);
ctx_params.n_threads = params.n_threads; ctx_params.n_threads = params.n_threads;
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
llama_context * ctx = llama_new_context_with_model(model, ctx_params); llama_context * ctx = llama_new_context_with_model(model, ctx_params);
@ -97,20 +109,15 @@ int main(int argc, char ** argv) {
fflush(stderr); fflush(stderr);
// create a llama_batch with size 512 // create a llama_batch
// we use this object to submit token data for decoding // we use this object to submit token data for decoding
llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t)n_parallel), 0, 1);
llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t)n_parallel), 0);
// evaluate the initial prompt // evaluate the initial prompt
batch.n_tokens = tokens_list.size(); for (size_t i = 0; i < tokens_list.size(); ++i) {
llama_batch_add(batch, tokens_list[i], i, { 0 }, false);
for (int32_t i = 0; i < batch.n_tokens; i++) {
batch.token[i] = tokens_list[i];
batch.pos[i] = i;
batch.seq_id[i] = 0;
batch.logits[i] = false;
} }
GGML_ASSERT(batch.n_tokens == (int) tokens_list.size());
// llama_decode will output logits only for the last token of the prompt // llama_decode will output logits only for the last token of the prompt
batch.logits[batch.n_tokens - 1] = true; batch.logits[batch.n_tokens - 1] = true;
@ -146,7 +153,7 @@ int main(int argc, char ** argv) {
while (n_cur <= n_len) { while (n_cur <= n_len) {
// prepare the next batch // prepare the next batch
batch.n_tokens = 0; llama_batch_clear(batch);
// sample the next token for each parallel sequence / stream // sample the next token for each parallel sequence / stream
for (int32_t i = 0; i < n_parallel; ++i) { for (int32_t i = 0; i < n_parallel; ++i) {
@ -180,7 +187,7 @@ int main(int argc, char ** argv) {
//const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p); //const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
// is it an end of stream? -> mark the stream as finished // is it an end of stream? -> mark the stream as finished
if (new_token_id == llama_token_eos(ctx) || n_cur == n_len) { if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
i_batch[i] = -1; i_batch[i] = -1;
LOG_TEE("\n"); LOG_TEE("\n");
if (n_parallel > 1) { if (n_parallel > 1) {
@ -198,15 +205,10 @@ int main(int argc, char ** argv) {
streams[i] += llama_token_to_piece(ctx, new_token_id); streams[i] += llama_token_to_piece(ctx, new_token_id);
// push this new token for next evaluation
batch.token [batch.n_tokens] = new_token_id;
batch.pos [batch.n_tokens] = n_cur;
batch.seq_id[batch.n_tokens] = i;
batch.logits[batch.n_tokens] = true;
i_batch[i] = batch.n_tokens; i_batch[i] = batch.n_tokens;
batch.n_tokens += 1; // push this new token for next evaluation
llama_batch_add(batch, new_token_id, n_cur, { i }, true);
n_decode += 1; n_decode += 1;
} }

View file

@ -47,7 +47,7 @@ struct beam_search_callback_data {
// In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same. // In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same.
// For example, eob can be flagged due to maximum token length, stop words, etc. // For example, eob can be flagged due to maximum token length, stop words, etc.
static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) { static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) {
return n_tokens && tokens[n_tokens-1] == llama_token_eos(callback_data.ctx); return n_tokens && tokens[n_tokens-1] == llama_token_eos(llama_get_model(callback_data.ctx));
} }
// Function matching type llama_beam_search_callback_fn_t. // Function matching type llama_beam_search_callback_fn_t.

View file

@ -1,9 +1,6 @@
set(TARGET benchmark) set(TARGET benchmark)
add_executable(${TARGET} benchmark-matmult.cpp) add_executable(${TARGET} benchmark-matmult.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
target_include_directories(${TARGET} PRIVATE ../../common) target_include_directories(${TARGET} PRIVATE ../../common)
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()

View file

@ -1,4 +1,3 @@
#include "build-info.h"
#include "common.h" #include "common.h"
#include "ggml.h" #include "ggml.h"
@ -130,13 +129,13 @@ int main(int argc, char ** argv) {
const ggml_type qtype = GGML_TYPE_Q4_1; const ggml_type qtype = GGML_TYPE_Q4_1;
size_t ctx_size = 0; size_t ctx_size = 0;
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
ctx_size += sizex*sizez*ggml_type_sizef(GGML_TYPE_F32); ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizez);
ctx_size += sizex*sizey*ggml_type_sizef(qtype); ctx_size += ggml_row_size(qtype, sizex*sizey);
ctx_size += sizex*sizey*ggml_type_sizef(qtype); ctx_size += ggml_row_size(qtype, sizex*sizey);
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
ctx_size += 1024*1024*16; ctx_size += 1024*1024*16;
printf("Allocating Memory of size %zi bytes, %zi MB\n",ctx_size, (ctx_size/1024/1024)); printf("Allocating Memory of size %zi bytes, %zi MB\n",ctx_size, (ctx_size/1024/1024));
@ -172,7 +171,8 @@ int main(int argc, char ** argv) {
struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2); struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2);
// printf("Creating compute graph\n"); // printf("Creating compute graph\n");
struct ggml_cgraph gf = ggml_build_forward(m11xm2); struct ggml_cgraph * gf = ggml_new_graph(ctx);
ggml_build_forward_expand(gf, m11xm2);
printf("n_threads=%i\n", benchmark_params.n_threads); printf("n_threads=%i\n", benchmark_params.n_threads);
@ -181,9 +181,9 @@ int main(int argc, char ** argv) {
std::vector<uint8_t> work_buffer; std::vector<uint8_t> work_buffer;
ggml_graph_compute_helper(work_buffer, &gf, benchmark_params.n_threads); ggml_graph_compute_helper(work_buffer, gf, benchmark_params.n_threads);
TENSOR_DUMP(gf.nodes[0]); TENSOR_DUMP(gf->nodes[0]);
printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype)); printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype));
@ -201,7 +201,8 @@ int main(int argc, char ** argv) {
struct ggml_tensor * q31 = ggml_mul_mat(ctx, q11, m2); struct ggml_tensor * q31 = ggml_mul_mat(ctx, q11, m2);
// printf("Creating compute graph\n"); // printf("Creating compute graph\n");
struct ggml_cgraph gf31 = ggml_build_forward(q31); struct ggml_cgraph * gf31 = ggml_new_graph(ctx);
ggml_build_forward_expand(gf31, q31);
// Set up a second graph computation to make sure we override the CPU cache lines // Set up a second graph computation to make sure we override the CPU cache lines
// printf("Creating new tensor q12 & Running quantize\n"); // printf("Creating new tensor q12 & Running quantize\n");
@ -212,7 +213,8 @@ int main(int argc, char ** argv) {
struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2); struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);
//printf("Creating compute graph\n"); //printf("Creating compute graph\n");
struct ggml_cgraph gf32 = ggml_build_forward(q32); struct ggml_cgraph * gf32 = ggml_new_graph(ctx);
ggml_build_forward_expand(gf32, q32);
printf("n_threads=%i\n", benchmark_params.n_threads); printf("n_threads=%i\n", benchmark_params.n_threads);
const int dimx = sizex; const int dimx = sizex;
@ -224,7 +226,7 @@ int main(int argc, char ** argv) {
// Let's use the F32 result from above as a reference for the quantized multiplication // Let's use the F32 result from above as a reference for the quantized multiplication
float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]); float sum_of_F32_reference = tensor_sum_elements(gf->nodes[0]);
printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n"); printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");
printf("=====================================================================================\n"); printf("=====================================================================================\n");
@ -234,7 +236,7 @@ int main(int argc, char ** argv) {
long long int start = ggml_time_us(); long long int start = ggml_time_us();
//printf("Running ggml_graph_compute\n"); //printf("Running ggml_graph_compute\n");
ggml_graph_compute_helper(work_buffer, &gf31, benchmark_params.n_threads); ggml_graph_compute_helper(work_buffer, gf31, benchmark_params.n_threads);
long long int stop = ggml_time_us(); long long int stop = ggml_time_us();
long long int usec = stop-start; long long int usec = stop-start;
@ -252,7 +254,7 @@ int main(int argc, char ** argv) {
// Check that the matrix multiplication result is in the right ballpark // Check that the matrix multiplication result is in the right ballpark
// We cannot use the exact value from the F32 multiplication because the quantizuation will be slightly different // We cannot use the exact value from the F32 multiplication because the quantizuation will be slightly different
float sum_of_Q4_result = tensor_sum_elements(gf31.nodes[0]); float sum_of_Q4_result = tensor_sum_elements(gf31->nodes[0]);
float delta = std::abs(sum_of_Q4_result - sum_of_F32_reference); float delta = std::abs(sum_of_Q4_result - sum_of_F32_reference);
float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; // Let's accept an epsilon of 10^-6 float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; // Let's accept an epsilon of 10^-6
@ -267,7 +269,7 @@ int main(int argc, char ** argv) {
} }
// Running a different graph computation to make sure we override the CPU cache lines // Running a different graph computation to make sure we override the CPU cache lines
ggml_graph_compute_helper(work_buffer, &gf32, benchmark_params.n_threads); ggml_graph_compute_helper(work_buffer, gf32, benchmark_params.n_threads);
} }
printf("\n"); printf("\n");
printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations)); printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations));

View file

@ -9,7 +9,7 @@ if [[ -z "${PROMPT_CACHE_FILE+x}" || -z "${CHAT_SAVE_DIR+x}" ]]; then
exit 1 exit 1
fi fi
MODEL="${MODEL:-./models/13B/ggml-model-q4_0.bin}" MODEL="${MODEL:-./models/llama-13b/ggml-model-q4_0.gguf}"
PROMPT_TEMPLATE="${PROMPT_TEMPLATE:-./prompts/chat.txt}" PROMPT_TEMPLATE="${PROMPT_TEMPLATE:-./prompts/chat.txt}"
USER_NAME="${USER_NAME:-User}" USER_NAME="${USER_NAME:-User}"
AI_NAME="${AI_NAME:-ChatLLaMa}" AI_NAME="${AI_NAME:-ChatLLaMa}"
@ -61,9 +61,9 @@ fi
if [[ ! -e "$PROMPT_CACHE_FILE" ]]; then if [[ ! -e "$PROMPT_CACHE_FILE" ]]; then
echo 'Prompt cache does not exist, building...' echo 'Prompt cache does not exist, building...'
# Default batch_size to 8 here for better user feedback during initial prompt processing # Default batch_size to 64 here for better user feedback during initial prompt processing
./main 2>>"$LOG" \ ./main 2>>"$LOG" \
--batch_size 8 \ --batch_size 64 \
"${OPTS[@]}" \ "${OPTS[@]}" \
--prompt-cache "$PROMPT_CACHE_FILE" \ --prompt-cache "$PROMPT_CACHE_FILE" \
--file "$CUR_PROMPT_FILE" \ --file "$CUR_PROMPT_FILE" \
@ -132,7 +132,7 @@ while read -e line; do
# HACK get num tokens from debug message # HACK get num tokens from debug message
# TODO get both messages in one go # TODO get both messages in one go
if ! session_size_msg="$(tail -n30 "$LOG" | grep -oE "$SESSION_SIZE_MSG_PATTERN")" || if ! session_size_msg="$(tail -n30 "$LOG" | grep -oE "$SESSION_SIZE_MSG_PATTERN")" ||
! sample_time_msg="$( tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then ! sample_time_msg="$(tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then
echo >&2 "Couldn't get number of tokens from ./main output!" echo >&2 "Couldn't get number of tokens from ./main output!"
exit 1 exit 1
fi fi

View file

@ -427,7 +427,7 @@ static void print_row(struct ggml_tensor * probs, int i) {
} }
static void print_matrix(struct ggml_tensor * probs) { static void print_matrix(struct ggml_tensor * probs) {
assert(probs->n_dims == 2); assert(ggml_is_matrix(probs));
for (int i = 0; i < probs->ne[1]; ++i) { for (int i = 0; i < probs->ne[1]; ++i) {
for (int k = 0; k < probs->ne[0]; ++k) { for (int k = 0; k < probs->ne[0]; ++k) {
float p = get_f32_2d(probs, k, i); float p = get_f32_2d(probs, k, i);
@ -536,7 +536,7 @@ static bool is_ggml_file(const char * filename) {
if (file.size < 4) { if (file.size < 4) {
return false; return false;
} }
uint32_t magic = file.read_u32(); std::string magic = file.read_string(4);
return magic == GGUF_MAGIC; return magic == GGUF_MAGIC;
} }
@ -639,7 +639,7 @@ static void load_vocab(const char *filename, Config *config, struct llama_vocab
static void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) { static void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) {
int ct; int ct;
switch (gg_weights->n_dims){ switch (ggml_n_dims(gg_weights)) {
case 1: case 1:
ct = 0; ct = 0;
for (int i0 = 0; i0 < gg_weights->ne[0]; i0++){ for (int i0 = 0; i0 < gg_weights->ne[0]; i0++){

View file

@ -1,4 +0,0 @@
PandaGPT
MiniGPT-4
*.pth

View file

@ -1,17 +0,0 @@
set(TARGET embdinput)
add_library(${TARGET} embd-input-lib.cpp embd-input.h)
install(TARGETS ${TARGET} LIBRARY)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()
set(TARGET embd-input-test)
add_executable(${TARGET} embd-input-test.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama embdinput ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()

View file

@ -1,63 +0,0 @@
### Examples for input embedding directly
## Requirement
build `libembdinput.so`
run the following comman in main dir (../../).
```
make
```
## [LLaVA](https://github.com/haotian-liu/LLaVA/) example (llava.py)
1. Obtian LLaVA model (following https://github.com/haotian-liu/LLaVA/ , use https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/).
2. Convert it to ggml format.
3. `llava_projection.pth` is [pytorch_model-00003-of-00003.bin](https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin).
```
import torch
bin_path = "../LLaVA-13b-delta-v1-1/pytorch_model-00003-of-00003.bin"
pth_path = "./examples/embd-input/llava_projection.pth"
dic = torch.load(bin_path)
used_key = ["model.mm_projector.weight","model.mm_projector.bias"]
torch.save({k: dic[k] for k in used_key}, pth_path)
```
4. Check the path of LLaVA model and `llava_projection.pth` in `llava.py`.
## [PandaGPT](https://github.com/yxuansu/PandaGPT) example (panda_gpt.py)
1. Obtian PandaGPT lora model from https://github.com/yxuansu/PandaGPT. Rename the file to `adapter_model.bin`. Use [convert-lora-to-ggml.py](../../convert-lora-to-ggml.py) to convert it to ggml format.
The `adapter_config.json` is
```
{
"peft_type": "LORA",
"fan_in_fan_out": false,
"bias": null,
"modules_to_save": null,
"r": 32,
"lora_alpha": 32,
"lora_dropout": 0.1,
"target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"]
}
```
2. Papare the `vicuna` v0 model.
3. Obtain the [ImageBind](https://dl.fbaipublicfiles.com/imagebind/imagebind_huge.pth) model.
4. Clone the PandaGPT source.
```
git clone https://github.com/yxuansu/PandaGPT
```
5. Install the requirement of PandaGPT.
6. Check the path of PandaGPT source, ImageBind model, lora model and vicuna model in panda_gpt.py.
## [MiniGPT-4](https://github.com/Vision-CAIR/MiniGPT-4/) example (minigpt4.py)
1. Obtain MiniGPT-4 model from https://github.com/Vision-CAIR/MiniGPT-4/ and put it in `embd-input`.
2. Clone the MiniGPT-4 source.
```
git clone https://github.com/Vision-CAIR/MiniGPT-4/
```
3. Install the requirement of PandaGPT.
4. Papare the `vicuna` v0 model.
5. Check the path of MiniGPT-4 source, MiniGPT-4 model and vicuna model in `minigpt4.py`.

View file

@ -1,220 +0,0 @@
#include "build-info.h"
#include "common.h"
#include "embd-input.h"
#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
static llama_context ** g_ctx;
extern "C" {
struct MyModel* create_mymodel(int argc, char ** argv) {
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
return nullptr;
}
print_build_info();
if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = uint32_t(time(NULL));
}
fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
llama_backend_init(params.numa);
llama_model * model;
llama_context * ctx;
g_ctx = &ctx;
// load the model and apply lora adapter, if any
std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (model == NULL) {
fprintf(stderr, "%s: error: unable to load model\n", __func__);
return nullptr;
}
// print system information
{
fprintf(stderr, "\n");
fprintf(stderr, "%s\n", get_system_info(params).c_str());
}
struct MyModel * ret = new MyModel();
ret->ctx = ctx;
ret->params = params;
ret->n_past = 0;
// printf("ctx: %d\n", ret->ctx);
return ret;
}
void free_mymodel(struct MyModel * mymodel) {
llama_context * ctx = mymodel->ctx;
llama_print_timings(ctx);
llama_free(ctx);
delete mymodel;
}
bool eval_float(void * model, float * input, int N){
MyModel * mymodel = (MyModel*)model;
llama_context * ctx = mymodel->ctx;
gpt_params params = mymodel->params;
int n_emb = llama_n_embd(llama_get_model(ctx));
int n_past = mymodel->n_past;
int n_batch = N; // params.n_batch;
for (int i = 0; i < (int) N; i += n_batch) {
int n_eval = (int) N - i;
if (n_eval > n_batch) {
n_eval = n_batch;
}
llama_batch batch = { int32_t(n_eval), nullptr, (input+i*n_emb), nullptr, nullptr, nullptr, n_past, 1, 0, };
if (llama_decode(ctx, batch)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return false;
}
n_past += n_eval;
}
mymodel->n_past = n_past;
return true;
}
bool eval_tokens(void * model, std::vector<llama_token> tokens) {
MyModel * mymodel = (MyModel* )model;
llama_context * ctx;
ctx = mymodel->ctx;
gpt_params params = mymodel->params;
int n_past = mymodel->n_past;
for (int i = 0; i < (int) tokens.size(); i += params.n_batch) {
int n_eval = (int) tokens.size() - i;
if (n_eval > params.n_batch) {
n_eval = params.n_batch;
}
if (llama_decode(ctx, llama_batch_get_one(&tokens[i], n_eval, n_past, 0))) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return false;
}
n_past += n_eval;
}
mymodel->n_past = n_past;
return true;
}
bool eval_id(struct MyModel* mymodel, int id) {
std::vector<llama_token> tokens;
tokens.push_back(id);
return eval_tokens(mymodel, tokens);
}
bool eval_string(struct MyModel * mymodel,const char* str){
llama_context * ctx = mymodel->ctx;
std::string str2 = str;
std::vector<llama_token> embd_inp = ::llama_tokenize(ctx, str2, true);
eval_tokens(mymodel, embd_inp);
return true;
}
llama_token sampling_id(struct MyModel* mymodel) {
llama_context* ctx = mymodel->ctx;
gpt_params params = mymodel->params;
// int n_ctx = llama_n_ctx(ctx);
// out of user input, sample next token
const float temp = params.temp;
const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx)) : params.top_k;
const float top_p = params.top_p;
const float tfs_z = params.tfs_z;
const float typical_p = params.typical_p;
// const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
// const float repeat_penalty = params.repeat_penalty;
// const float alpha_presence = params.presence_penalty;
// const float alpha_frequency = params.frequency_penalty;
const int mirostat = params.mirostat;
const float mirostat_tau = params.mirostat_tau;
const float mirostat_eta = params.mirostat_eta;
// const bool penalize_nl = params.penalize_nl;
llama_token id = 0;
{
auto logits = llama_get_logits(ctx);
auto n_vocab = llama_n_vocab(llama_get_model(ctx));
// Apply params.logit_bias map
for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
logits[it->first] += it->second;
}
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
}
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
// TODO: Apply penalties
// float nl_logit = logits[llama_token_nl(ctx)];
// auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
// llama_sample_repetition_penalty(ctx, &candidates_p,
// last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
// last_n_repeat, repeat_penalty);
// llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
// last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
// last_n_repeat, alpha_frequency, alpha_presence);
// if (!penalize_nl) {
// logits[llama_token_nl(ctx)] = nl_logit;
// }
if (temp <= 0) {
// Greedy sampling
id = llama_sample_token_greedy(ctx, &candidates_p);
} else {
if (mirostat == 1) {
static float mirostat_mu = 2.0f * mirostat_tau;
const int mirostat_m = 100;
llama_sample_temp(ctx, &candidates_p, temp);
id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
} else if (mirostat == 2) {
static float mirostat_mu = 2.0f * mirostat_tau;
llama_sample_temp(ctx, &candidates_p, temp);
id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
} else {
// Temperature sampling
llama_sample_top_k(ctx, &candidates_p, top_k, 1);
llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1);
llama_sample_typical(ctx, &candidates_p, typical_p, 1);
llama_sample_top_p(ctx, &candidates_p, top_p, 1);
llama_sample_temp(ctx, &candidates_p, temp);
id = llama_sample_token(ctx, &candidates_p);
}
}
}
return id;
}
const char * sampling(struct MyModel * mymodel) {
llama_context * ctx = mymodel->ctx;
int id = sampling_id(mymodel);
static std::string ret;
if (id == llama_token_eos(ctx)) {
ret = "</s>";
} else {
ret = llama_token_to_piece(ctx, id);
}
eval_id(mymodel, id);
return ret.c_str();
}
}

View file

@ -1,35 +0,0 @@
#include "embd-input.h"
#include <stdlib.h>
#include <random>
#include <string.h>
int main(int argc, char** argv) {
auto mymodel = create_mymodel(argc, argv);
int N = 10;
int max_tgt_len = 500;
int n_embd = llama_n_embd(llama_get_model(mymodel->ctx));
// add random float embd to test evaluation
float * data = new float[N*n_embd];
std::default_random_engine e;
std::uniform_real_distribution<float> u(0,1);
for (int i=0;i<N*n_embd;i++) {
data[i] = u(e);
}
eval_string(mymodel, "user: what is the color of the flag of UN?");
eval_float(mymodel, data, N);
eval_string(mymodel, "assistant:");
eval_string(mymodel, mymodel->params.prompt.c_str());
const char* tmp;
for (int i=0; i<max_tgt_len; i++) {
tmp = sampling(mymodel);
if (strcmp(tmp, "</s>")==0) break;
printf("%s", tmp);
fflush(stdout);
}
printf("\n");
free_mymodel(mymodel);
return 0;
}

View file

@ -1,27 +0,0 @@
#ifndef _EMBD_INPUT_H_
#define _EMBD_INPUT_H_ 1
#include "common.h"
#include "llama.h"
extern "C" {
typedef struct MyModel {
llama_context* ctx;
gpt_params params;
int n_past = 0;
} MyModel;
struct MyModel* create_mymodel(int argc, char ** argv);
bool eval_float(void* model, float* input, int N);
bool eval_tokens(void* model, std::vector<llama_token> tokens);
bool eval_id(struct MyModel* mymodel, int id);
bool eval_string(struct MyModel* mymodel, const char* str);
const char * sampling(struct MyModel* mymodel);
llama_token sampling_id(struct MyModel* mymodel);
void free_mymodel(struct MyModel* mymodel);
}
#endif

View file

@ -1,72 +0,0 @@
#!/usr/bin/env python3
import ctypes
from ctypes import cdll, c_char_p, c_void_p, POINTER, c_float, c_int
import numpy as np
import os
libc = cdll.LoadLibrary("./libembdinput.so")
libc.sampling.restype=c_char_p
libc.create_mymodel.restype=c_void_p
libc.eval_string.argtypes=[c_void_p, c_char_p]
libc.sampling.argtypes=[c_void_p]
libc.eval_float.argtypes=[c_void_p, POINTER(c_float), c_int]
class MyModel:
def __init__(self, args):
argc = len(args)
c_str = [c_char_p(i.encode()) for i in args]
args_c = (c_char_p * argc)(*c_str)
self.model = c_void_p(libc.create_mymodel(argc, args_c))
self.max_tgt_len = 512
self.print_string_eval = True
def __del__(self):
libc.free_mymodel(self.model)
def eval_float(self, x):
libc.eval_float(self.model, x.astype(np.float32).ctypes.data_as(POINTER(c_float)), x.shape[1])
def eval_string(self, x):
libc.eval_string(self.model, x.encode()) # c_char_p(x.encode()))
if self.print_string_eval:
print(x)
def eval_token(self, x):
libc.eval_id(self.model, x)
def sampling(self):
s = libc.sampling(self.model)
return s
def stream_generate(self, end="</s>"):
ret = b""
end = end.encode()
for _ in range(self.max_tgt_len):
tmp = self.sampling()
ret += tmp
yield tmp
if ret.endswith(end):
break
def generate_with_print(self, end="</s>"):
ret = b""
for i in self.stream_generate(end=end):
ret += i
print(i.decode(errors="replace"), end="", flush=True)
print("")
return ret.decode(errors="replace")
def generate(self, end="</s>"):
text = b"".join(self.stream_generate(end=end))
return text.decode(errors="replace")
if __name__ == "__main__":
model = MyModel(["main", "--model", "../llama.cpp/models/ggml-vic13b-q4_1.bin", "-c", "2048"])
model.eval_string("""user: what is the color of the flag of UN?""")
x = np.random.random((5120,10))# , dtype=np.float32)
model.eval_float(x)
model.eval_string("""assistant:""")
for i in model.generate():
print(i.decode(errors="replace"), end="", flush=True)

View file

@ -1,71 +0,0 @@
#!/usr/bin/env python3
import sys
import os
sys.path.insert(0, os.path.dirname(__file__))
from embd_input import MyModel
import numpy as np
from torch import nn
import torch
from transformers import CLIPVisionModel, CLIPImageProcessor
from PIL import Image
# model parameters from 'liuhaotian/LLaVA-13b-delta-v1-1'
vision_tower = "openai/clip-vit-large-patch14"
select_hidden_state_layer = -2
# (vision_config.image_size // vision_config.patch_size) ** 2
image_token_len = (224//14)**2
class Llava:
def __init__(self, args):
self.image_processor = CLIPImageProcessor.from_pretrained(vision_tower)
self.vision_tower = CLIPVisionModel.from_pretrained(vision_tower)
self.mm_projector = nn.Linear(1024, 5120)
self.model = MyModel(["main", *args])
def load_projection(self, path):
state = torch.load(path)
self.mm_projector.load_state_dict({
"weight": state["model.mm_projector.weight"],
"bias": state["model.mm_projector.bias"]})
def chat(self, question):
self.model.eval_string("user: ")
self.model.eval_string(question)
self.model.eval_string("\nassistant: ")
return self.model.generate_with_print()
def chat_with_image(self, image, question):
with torch.no_grad():
embd_image = self.image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
image_forward_out = self.vision_tower(embd_image.unsqueeze(0), output_hidden_states=True)
select_hidden_state = image_forward_out.hidden_states[select_hidden_state_layer]
image_feature = select_hidden_state[:, 1:]
embd_image = self.mm_projector(image_feature)
embd_image = embd_image.cpu().numpy()[0]
self.model.eval_string("user: ")
self.model.eval_token(32003-2) # im_start
self.model.eval_float(embd_image.T)
for i in range(image_token_len-embd_image.shape[0]):
self.model.eval_token(32003-3) # im_patch
self.model.eval_token(32003-1) # im_end
self.model.eval_string(question)
self.model.eval_string("\nassistant: ")
return self.model.generate_with_print()
if __name__=="__main__":
# model form liuhaotian/LLaVA-13b-delta-v1-1
a = Llava(["--model", "./models/ggml-llava-13b-v1.1.bin", "-c", "2048"])
# Extract from https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin.
# Also here can use pytorch_model-00003-of-00003.bin directly.
a.load_projection(os.path.join(
os.path.dirname(__file__) ,
"llava_projection.pth"))
respose = a.chat_with_image(
Image.open("./media/llama1-logo.png").convert('RGB'),
"what is the text in the picture?")
respose
a.chat("what is the color of it?")

View file

@ -1,129 +0,0 @@
#!/usr/bin/env python3
import sys
import os
sys.path.insert(0, os.path.dirname(__file__))
from embd_input import MyModel
import numpy as np
from torch import nn
import torch
from PIL import Image
minigpt4_path = os.path.join(os.path.dirname(__file__), "MiniGPT-4")
sys.path.insert(0, minigpt4_path)
from minigpt4.models.blip2 import Blip2Base
from minigpt4.processors.blip_processors import Blip2ImageEvalProcessor
class MiniGPT4(Blip2Base):
"""
MiniGPT4 model from https://github.com/Vision-CAIR/MiniGPT-4
"""
def __init__(self,
args,
vit_model="eva_clip_g",
q_former_model="https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xxl.pth",
img_size=224,
drop_path_rate=0,
use_grad_checkpoint=False,
vit_precision="fp32",
freeze_vit=True,
freeze_qformer=True,
num_query_token=32,
llama_model="",
prompt_path="",
prompt_template="",
max_txt_len=32,
end_sym='\n',
low_resource=False, # use 8 bit and put vit in cpu
device_8bit=0
):
super().__init__()
self.img_size = img_size
self.low_resource = low_resource
self.preprocessor = Blip2ImageEvalProcessor(img_size)
print('Loading VIT')
self.visual_encoder, self.ln_vision = self.init_vision_encoder(
vit_model, img_size, drop_path_rate, use_grad_checkpoint, vit_precision
)
print('Loading VIT Done')
print('Loading Q-Former')
self.Qformer, self.query_tokens = self.init_Qformer(
num_query_token, self.visual_encoder.num_features
)
self.Qformer.cls = None
self.Qformer.bert.embeddings.word_embeddings = None
self.Qformer.bert.embeddings.position_embeddings = None
for layer in self.Qformer.bert.encoder.layer:
layer.output = None
layer.intermediate = None
self.load_from_pretrained(url_or_filename=q_former_model)
print('Loading Q-Former Done')
self.llama_proj = nn.Linear(
self.Qformer.config.hidden_size, 5120 # self.llama_model.config.hidden_size
)
self.max_txt_len = max_txt_len
self.end_sym = end_sym
self.model = MyModel(["main", *args])
# system prompt
self.model.eval_string("Give the following image: <Img>ImageContent</Img>. "
"You will be able to see the image once I provide it to you. Please answer my questions."
"###")
def encode_img(self, image):
image = self.preprocessor(image)
image = image.unsqueeze(0)
device = image.device
if self.low_resource:
self.vit_to_cpu()
image = image.to("cpu")
with self.maybe_autocast():
image_embeds = self.ln_vision(self.visual_encoder(image)).to(device)
image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(device)
query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
query_output = self.Qformer.bert(
query_embeds=query_tokens,
encoder_hidden_states=image_embeds,
encoder_attention_mask=image_atts,
return_dict=True,
)
inputs_llama = self.llama_proj(query_output.last_hidden_state)
# atts_llama = torch.ones(inputs_llama.size()[:-1], dtype=torch.long).to(image.device)
return inputs_llama
def load_projection(self, path):
state = torch.load(path)["model"]
self.llama_proj.load_state_dict({
"weight": state["llama_proj.weight"],
"bias": state["llama_proj.bias"]})
def chat(self, question):
self.model.eval_string("Human: ")
self.model.eval_string(question)
self.model.eval_string("\n### Assistant:")
return self.model.generate_with_print(end="###")
def chat_with_image(self, image, question):
with torch.no_grad():
embd_image = self.encode_img(image)
embd_image = embd_image.cpu().numpy()[0]
self.model.eval_string("Human: <Img>")
self.model.eval_float(embd_image.T)
self.model.eval_string("</Img> ")
self.model.eval_string(question)
self.model.eval_string("\n### Assistant:")
return self.model.generate_with_print(end="###")
if __name__=="__main__":
a = MiniGPT4(["--model", "./models/ggml-vicuna-13b-v0-q4_1.bin", "-c", "2048"])
a.load_projection(os.path.join(
os.path.dirname(__file__) ,
"pretrained_minigpt4.pth"))
respose = a.chat_with_image(
Image.open("./media/llama1-logo.png").convert('RGB'),
"what is the text in the picture?")
a.chat("what is the color of it?")

View file

@ -1,99 +0,0 @@
#!/usr/bin/env python3
import sys
import os
sys.path.insert(0, os.path.dirname(__file__))
from embd_input import MyModel
import numpy as np
from torch import nn
import torch
# use PandaGPT path
panda_gpt_path = os.path.join(os.path.dirname(__file__), "PandaGPT")
imagebind_ckpt_path = "./models/panda_gpt/"
sys.path.insert(0, os.path.join(panda_gpt_path,"code","model"))
from ImageBind.models import imagebind_model
from ImageBind import data
ModalityType = imagebind_model.ModalityType
max_tgt_len = 400
class PandaGPT:
def __init__(self, args):
self.visual_encoder,_ = imagebind_model.imagebind_huge(pretrained=True, store_path=imagebind_ckpt_path)
self.visual_encoder.eval()
self.llama_proj = nn.Linear(1024, 5120) # self.visual_hidden_size, 5120)
self.max_tgt_len = max_tgt_len
self.model = MyModel(["main", *args])
self.generated_text = ""
self.device = "cpu"
def load_projection(self, path):
state = torch.load(path, map_location="cpu")
self.llama_proj.load_state_dict({
"weight": state["llama_proj.weight"],
"bias": state["llama_proj.bias"]})
def eval_inputs(self, inputs):
self.model.eval_string("<Img>")
embds = self.extract_multimoal_feature(inputs)
for i in embds:
self.model.eval_float(i.T)
self.model.eval_string("</Img> ")
def chat(self, question):
return self.chat_with_image(None, question)
def chat_with_image(self, inputs, question):
if self.generated_text == "":
self.model.eval_string("###")
self.model.eval_string(" Human: ")
if inputs:
self.eval_inputs(inputs)
self.model.eval_string(question)
self.model.eval_string("\n### Assistant:")
ret = self.model.generate_with_print(end="###")
self.generated_text += ret
return ret
def extract_multimoal_feature(self, inputs):
features = []
for key in ["image", "audio", "video", "thermal"]:
if key + "_paths" in inputs:
embeds = self.encode_data(key, inputs[key+"_paths"])
features.append(embeds)
return features
def encode_data(self, data_type, data_paths):
type_map = {
"image": ModalityType.VISION,
"audio": ModalityType.AUDIO,
"video": ModalityType.VISION,
"thermal": ModalityType.THERMAL,
}
load_map = {
"image": data.load_and_transform_vision_data,
"audio": data.load_and_transform_audio_data,
"video": data.load_and_transform_video_data,
"thermal": data.load_and_transform_thermal_data
}
load_function = load_map[data_type]
key = type_map[data_type]
inputs = {key: load_function(data_paths, self.device)}
with torch.no_grad():
embeddings = self.visual_encoder(inputs)
embeds = embeddings[key]
embeds = self.llama_proj(embeds).cpu().numpy()
return embeds
if __name__=="__main__":
a = PandaGPT(["--model", "./models/ggml-vicuna-13b-v0-q4_1.bin", "-c", "2048", "--lora", "./models/panda_gpt/ggml-adapter-model.bin","--temp", "0"])
a.load_projection("./models/panda_gpt/adapter_model.bin")
a.chat_with_image(
{"image_paths": ["./media/llama1-logo.png"]},
"what is the text in the picture? 'llama' or 'lambda'?")
a.chat("what is the color of it?")

View file

@ -3,6 +3,3 @@ add_executable(${TARGET} embedding.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()

View file

@ -1,4 +1,3 @@
#include "build-info.h"
#include "common.h" #include "common.h"
#include "llama.h" #include "llama.h"

View file

@ -240,7 +240,7 @@ static struct lora_data * load_lora(struct lora_info * info) {
} }
struct ggml_init_params params_ggml; struct ggml_init_params params_ggml;
params_ggml.mem_size = ggml_tensor_overhead() * GGML_MAX_NODES; params_ggml.mem_size = ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE;
params_ggml.mem_buffer = NULL; params_ggml.mem_buffer = NULL;
params_ggml.no_alloc = true; params_ggml.no_alloc = true;
result->ctx = ggml_init(params_ggml); result->ctx = ggml_init(params_ggml);
@ -334,7 +334,7 @@ static bool apply_lora(struct ggml_tensor * tensor, struct lora_data * lora, int
float scaling = lora->info.scale * (float)lora->lora_alpha / (float)lora->lora_r; float scaling = lora->info.scale * (float)lora->lora_alpha / (float)lora->lora_r;
struct ggml_init_params params; struct ggml_init_params params;
params.mem_size = GGML_OBJECT_SIZE + GGML_GRAPH_SIZE + ggml_tensor_overhead()*4 + GGML_MEM_ALIGN*5; params.mem_size = GGML_OBJECT_SIZE + ggml_graph_overhead() + ggml_tensor_overhead()*4 + GGML_MEM_ALIGN*5;
params.mem_buffer = NULL; params.mem_buffer = NULL;
params.no_alloc = true; params.no_alloc = true;
struct ggml_context * ctx = NULL; struct ggml_context * ctx = NULL;

View file

@ -21,7 +21,7 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s
./bin/main -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin ./bin/main -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
``` ```
Finetune output files will be saved every N iterations (config with `--save-every N`). **Only llama based models are supported!** The output files will be saved every N iterations (config with `--save-every N`).
The pattern 'ITERATION' in the output filenames will be replaced with the iteration number and with 'LATEST' for the latest output. The pattern 'ITERATION' in the output filenames will be replaced with the iteration number and with 'LATEST' for the latest output.
So in above example after 10 iterations these files will be written: So in above example after 10 iterations these files will be written:
- chk-lora-open-llama-3b-v2-q8_0-shakespeare-10.gguf - chk-lora-open-llama-3b-v2-q8_0-shakespeare-10.gguf
@ -61,7 +61,7 @@ For example to apply 40% of the 'shakespeare' LORA adapter, 80% of the 'bible' L
--lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin --lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin
``` ```
The scale numbers don't need to add up to one, and you can also use numbers creater than 1 to further increase the influence of an adapter. But making the values to big will sometimes result in worse output. Play around to find good values. The scale numbers don't need to add up to one, and you can also use numbers greater than 1 to further increase the influence of an adapter. But making the values to big will sometimes result in worse output. Play around to find good values.
Gradient checkpointing reduces the memory requirements by ~50% but increases the runtime. Gradient checkpointing reduces the memory requirements by ~50% but increases the runtime.
If you have enough RAM, you can make finetuning a bit faster by disabling checkpointing with `--no-checkpointing`. If you have enough RAM, you can make finetuning a bit faster by disabling checkpointing with `--no-checkpointing`.

View file

@ -3,9 +3,7 @@
import argparse import argparse
import gguf import gguf
import os
import struct import struct
import sys
import numpy as np import numpy as np
from pathlib import Path from pathlib import Path
@ -313,7 +311,7 @@ class ModelParams:
gguf_writer.add_feed_forward_length(self.get_n_ff()) gguf_writer.add_feed_forward_length(self.get_n_ff())
def tensor_name(key, bid=None, suffix=".weight"): def tensor_name(key, bid=None, suffix=".weight"):
return gguf.MODEL_TENSOR_NAMES[gguf.MODEL_ARCH.LLAMA][key].format(bid=bid) + suffix return gguf.TENSOR_NAMES[key].format(bid=bid) + suffix
class Layer: class Layer:
def __init__(self, params, lora_params, bid): def __init__(self, params, lora_params, bid):

View file

@ -332,8 +332,8 @@ static void init_model(struct llama_model * input, struct my_llama_model * model
assert_shape_1d(layer.attention_norm, hparams.n_embd); assert_shape_1d(layer.attention_norm, hparams.n_embd);
assert_shape_2d(layer.wq, hparams.n_embd, hparams.n_embd); assert_shape_2d(layer.wq, hparams.n_embd, hparams.n_embd);
assert_shape_2d(layer.wk, hparams.n_embd, hparams.n_embd); assert_shape_2d(layer.wk, hparams.n_embd, hparams.n_embd_gqa());
assert_shape_2d(layer.wv, hparams.n_embd, hparams.n_embd); assert_shape_2d(layer.wv, hparams.n_embd, hparams.n_embd_gqa());
assert_shape_2d(layer.wo, hparams.n_embd, hparams.n_embd); assert_shape_2d(layer.wo, hparams.n_embd, hparams.n_embd);
assert_shape_1d(layer.ffn_norm, hparams.n_embd); assert_shape_1d(layer.ffn_norm, hparams.n_embd);
assert_shape_2d(layer.w1, hparams.n_embd, hparams.n_ff); assert_shape_2d(layer.w1, hparams.n_embd, hparams.n_ff);
@ -529,13 +529,14 @@ static void init_lora(const struct my_llama_model * model, struct my_llama_lora
set_param_lora(lora); set_param_lora(lora);
// measure data size // measure data size
struct ggml_allocr * alloc = NULL; size_t size = 0;
alloc = ggml_allocr_new_measure(tensor_alignment); for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
alloc_lora(alloc, lora); size += GGML_PAD(ggml_nbytes(t), tensor_alignment);
}
// allocate data // allocate data
lora->data.resize(ggml_allocr_max_size(alloc) + tensor_alignment); struct ggml_allocr * alloc = NULL;
ggml_allocr_free(alloc); lora->data.resize(size + tensor_alignment);
alloc = ggml_allocr_new(lora->data.data(), lora->data.size(), tensor_alignment); alloc = ggml_allocr_new(lora->data.data(), lora->data.size(), tensor_alignment);
alloc_lora(alloc, lora); alloc_lora(alloc, lora);
ggml_allocr_free(alloc); ggml_allocr_free(alloc);
@ -547,35 +548,35 @@ static void randomize_lora(struct my_llama_lora * lora, int seed, float mean, fl
struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max); struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max);
randomize_tensor_normal(lora->tok_embeddings_a, rnd); randomize_tensor_normal(lora->tok_embeddings_a, rnd);
randomize_tensor_normal(lora->tok_embeddings_b, rnd); ggml_set_zero(lora->tok_embeddings_b);
randomize_tensor_normal(lora->norm_a, rnd); randomize_tensor_normal(lora->norm_a, rnd);
randomize_tensor_normal(lora->norm_b, rnd); ggml_set_zero(lora->norm_b);
randomize_tensor_normal(lora->output_a, rnd); randomize_tensor_normal(lora->output_a, rnd);
randomize_tensor_normal(lora->output_b, rnd); ggml_set_zero(lora->output_b);
for (uint32_t i = 0; i < n_layer; ++i) { for (uint32_t i = 0; i < n_layer; ++i) {
auto & layer = lora->layers[i]; auto & layer = lora->layers[i];
randomize_tensor_normal(layer.attention_norm_a, rnd); randomize_tensor_normal(layer.attention_norm_a, rnd);
randomize_tensor_normal(layer.attention_norm_b, rnd); ggml_set_zero(layer.attention_norm_b);
randomize_tensor_normal(layer.wq_a, rnd); randomize_tensor_normal(layer.wq_a, rnd);
randomize_tensor_normal(layer.wq_b, rnd); ggml_set_zero(layer.wq_b);
randomize_tensor_normal(layer.wk_a, rnd); randomize_tensor_normal(layer.wk_a, rnd);
randomize_tensor_normal(layer.wk_b, rnd); ggml_set_zero(layer.wk_b);
randomize_tensor_normal(layer.wv_a, rnd); randomize_tensor_normal(layer.wv_a, rnd);
randomize_tensor_normal(layer.wv_b, rnd); ggml_set_zero(layer.wv_b);
randomize_tensor_normal(layer.wo_a, rnd); randomize_tensor_normal(layer.wo_a, rnd);
randomize_tensor_normal(layer.wo_b, rnd); ggml_set_zero(layer.wo_b);
randomize_tensor_normal(layer.ffn_norm_a, rnd); randomize_tensor_normal(layer.ffn_norm_a, rnd);
randomize_tensor_normal(layer.ffn_norm_b, rnd); ggml_set_zero(layer.ffn_norm_b);
randomize_tensor_normal(layer.w1_a, rnd); randomize_tensor_normal(layer.w1_a, rnd);
randomize_tensor_normal(layer.w1_b, rnd); ggml_set_zero(layer.w1_b);
randomize_tensor_normal(layer.w2_a, rnd); randomize_tensor_normal(layer.w2_a, rnd);
randomize_tensor_normal(layer.w2_b, rnd); ggml_set_zero(layer.w2_b);
randomize_tensor_normal(layer.w3_a, rnd); randomize_tensor_normal(layer.w3_a, rnd);
randomize_tensor_normal(layer.w3_b, rnd); ggml_set_zero(layer.w3_b);
} }
free_random_normal_distribution(rnd); free_random_normal_distribution(rnd);
@ -626,7 +627,8 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
// KQ_pos - contains the positions // KQ_pos - contains the positions
struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N); struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N);
{ ggml_allocr_alloc(alloc, KQ_pos);
if (!ggml_allocr_is_measure(alloc)) {
int * data = (int *) KQ_pos->data; int * data = (int *) KQ_pos->data;
for (int i = 0; i < N; ++i) { for (int i = 0; i < N; ++i) {
data[i] = n_past + i; data[i] = n_past + i;
@ -640,8 +642,9 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
const int rope_mode = 0; const int rope_mode = 0;
return ggml_rope_custom(ctx, return ggml_rope_custom(ctx,
t, KQ_pos, n_rot, rope_mode, n_ctx, t, KQ_pos, n_rot, rope_mode, n_ctx, 0,
rope_freq_base, rope_freq_scale); rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
);
}; };
set_name(tokens_input, "tokens_input"); set_name(tokens_input, "tokens_input");
@ -650,7 +653,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
GGML_ASSERT(tokens_input->type == GGML_TYPE_I32); GGML_ASSERT(tokens_input->type == GGML_TYPE_I32);
auto add_to_f32 = [] (struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) { auto add_to_f32 = [] (struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
if (ggml_is_quantized(a->type)) { if (ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16) {
return ggml_add_cast(ctx, a, b, GGML_TYPE_F32); return ggml_add_cast(ctx, a, b, GGML_TYPE_F32);
} else if (a->type == GGML_TYPE_F32) { } else if (a->type == GGML_TYPE_F32) {
return ggml_add(ctx, a, b); return ggml_add(ctx, a, b);
@ -769,7 +772,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
if (enable_checkpointing) { if (enable_checkpointing) {
ggml_build_backward_gradient_checkpointing(ctx, gf, gb, gb_tmp, checkpoints.data(), (int) checkpoints.size()); ggml_build_backward_gradient_checkpointing(ctx, gf, gb, gb_tmp, checkpoints.data(), (int) checkpoints.size());
} else { } else {
*gb = *gf; ggml_graph_cpy(gf, gb);
ggml_build_backward_expand(ctx, gf, gb, true); ggml_build_backward_expand(ctx, gf, gb, true);
} }
@ -786,6 +789,8 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, one)); ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, one));
GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL); GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL);
ggml_allocr_alloc(alloc, t36->grad); ggml_allocr_alloc(alloc, t36->grad);
// KQ_pos
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, one));
// make sure base model tensors data cannot be used in viewable operations // make sure base model tensors data cannot be used in viewable operations
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->tok_embeddings, one)); ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->tok_embeddings, one));
@ -1105,7 +1110,7 @@ static void write_tensor(struct llama_file * file, struct ggml_tensor * tensor,
name = ggml_get_name(tensor); name = ggml_get_name(tensor);
} }
uint32_t name_len = strlen(name); uint32_t name_len = strlen(name);
uint32_t nd = tensor->n_dims; uint32_t nd = ggml_n_dims(tensor);
uint32_t ne[4] = { (uint32_t)tensor->ne[0], uint32_t ne[4] = { (uint32_t)tensor->ne[0],
(uint32_t)tensor->ne[1], (uint32_t)tensor->ne[1],
(uint32_t)tensor->ne[2], (uint32_t)tensor->ne[2],
@ -1541,6 +1546,7 @@ int main(int argc, char ** argv) {
srand(params.common.seed); srand(params.common.seed);
struct llama_model_params llama_mparams = llama_model_default_params(); struct llama_model_params llama_mparams = llama_model_default_params();
llama_mparams.n_gpu_layers = params.common.n_gpu_layers;
llama_mparams.vocab_only = false; llama_mparams.vocab_only = false;
printf("%s: model base = '%s'\n", __func__, params.fn_model_base); printf("%s: model base = '%s'\n", __func__, params.fn_model_base);
@ -1598,6 +1604,7 @@ int main(int argc, char ** argv) {
opt->params = ggml_opt_default_params(GGML_OPT_ADAM); opt->params = ggml_opt_default_params(GGML_OPT_ADAM);
opt->params.print_forward_graph = false; opt->params.print_forward_graph = false;
opt->params.print_backward_graph = false; opt->params.print_backward_graph = false;
opt->params.graph_size = LLAMA_TRAIN_MAX_NODES;
opt->params.n_threads = params.common.n_threads; opt->params.n_threads = params.common.n_threads;
opt->params.past = params.common.opt_past; opt->params.past = params.common.opt_past;
opt->params.delta = params.common.opt_delta; opt->params.delta = params.common.opt_delta;
@ -1711,11 +1718,9 @@ int main(int argc, char ** argv) {
struct ggml_tensor * target_probs = ggml_new_tensor_3d(ctx_input, GGML_TYPE_F32, n_vocab, n_tokens, n_batch); struct ggml_tensor * target_probs = ggml_new_tensor_3d(ctx_input, GGML_TYPE_F32, n_vocab, n_tokens, n_batch);
// measure required memory for input tensors // measure required memory for input tensors
alloc = ggml_allocr_new_measure(tensor_alignment); size_t max_input_size = GGML_PAD(ggml_nbytes(tokens_input), tensor_alignment) +
ggml_allocr_alloc(alloc, tokens_input); GGML_PAD(ggml_nbytes(target_probs), tensor_alignment) +
ggml_allocr_alloc(alloc, target_probs); tensor_alignment;
size_t max_input_size = ggml_allocr_max_size(alloc) + tensor_alignment;
ggml_allocr_free(alloc);
printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f)); printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f));
// allocate input tensors // allocate input tensors
@ -1726,11 +1731,9 @@ int main(int argc, char ** argv) {
ggml_allocr_free(alloc); ggml_allocr_free(alloc);
// context for compute tensors without their data // context for compute tensors without their data
size_t estimated_compute_size_wo_data = ( const size_t estimated_compute_size_wo_data = (
ggml_tensor_overhead()*GGML_MAX_NODES*2 2*LLAMA_TRAIN_MAX_NODES*ggml_tensor_overhead() +
+ (GGML_OBJECT_SIZE+GGML_GRAPH_SIZE)*( (params.common.use_checkpointing ? 3 : 2)*(GGML_OBJECT_SIZE+ggml_graph_overhead_custom(LLAMA_TRAIN_MAX_NODES, true))
params.common.use_checkpointing ? 3 : 2
)
); );
struct ggml_init_params ctx_compute_params = { struct ggml_init_params ctx_compute_params = {
estimated_compute_size_wo_data, // mem_size estimated_compute_size_wo_data, // mem_size
@ -1753,11 +1756,11 @@ int main(int argc, char ** argv) {
for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) { for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) {
ctx_compute = ggml_init(ctx_compute_params); ctx_compute = ggml_init(ctx_compute_params);
alloc = ggml_allocr_new_measure(tensor_alignment); alloc = ggml_allocr_new_measure(tensor_alignment);
gf = ggml_new_graph(ctx_compute); gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
gf->order = (enum ggml_cgraph_eval_order) order; gf->order = (enum ggml_cgraph_eval_order) order;
gb = ggml_new_graph(ctx_compute); gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
gb_tmp = params.common.use_checkpointing gb_tmp = params.common.use_checkpointing
? ggml_new_graph(ctx_compute) ? ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true)
: NULL; : NULL;
loss = llama_build_lora_finetune_graphs( loss = llama_build_lora_finetune_graphs(
&model, &lora, alloc, ctx_compute, &model, &lora, alloc, ctx_compute,
@ -1786,11 +1789,11 @@ int main(int argc, char ** argv) {
mem_compute_data.resize(max_compute_size); mem_compute_data.resize(max_compute_size);
ctx_compute = ggml_init(ctx_compute_params); ctx_compute = ggml_init(ctx_compute_params);
alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment); alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment);
gf = ggml_new_graph(ctx_compute); gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
gf->order = best_order; gf->order = best_order;
gb = ggml_new_graph(ctx_compute); gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
gb_tmp = params.common.use_checkpointing gb_tmp = params.common.use_checkpointing
? ggml_new_graph(ctx_compute) ? ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true)
: NULL; : NULL;
loss = llama_build_lora_finetune_graphs( loss = llama_build_lora_finetune_graphs(
&model, &lora, alloc, ctx_compute, &model, &lora, alloc, ctx_compute,

View file

@ -0,0 +1,34 @@
#!/bin/bash
cd `dirname $0`
cd ../..
EXE="./finetune"
if [[ ! $LLAMA_MODEL_DIR ]]; then LLAMA_MODEL_DIR="./models"; fi
if [[ ! $LLAMA_TRAINING_DIR ]]; then LLAMA_TRAINING_DIR="."; fi
# MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2-q8_0.gguf" # This is the model the readme uses.
MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2.gguf" # An f16 model. Note in this case with "-g", you get an f32-format .BIN file that isn't yet supported if you use it with "main --lora" with GPU inferencing.
while getopts "dg" opt; do
case $opt in
d)
DEBUGGER="gdb --args"
;;
g)
EXE="./build/bin/Release/finetune"
GPUARG="--gpu-layers 25"
;;
esac
done
$DEBUGGER $EXE \
--model-base $MODEL \
$GPUARG \
--checkpoint-in chk-ol3b-shakespeare-LATEST.gguf \
--checkpoint-out chk-ol3b-shakespeare-ITERATION.gguf \
--lora-out lora-ol3b-shakespeare-ITERATION.bin \
--train-data "$LLAMA_TRAINING_DIR\shakespeare.txt" \
--save-every 10 \
--threads 10 --adam-iter 30 --batch 4 --ctx 64 \
--use-checkpointing

View file

@ -195,7 +195,7 @@ static bool gguf_ex_read_1(const std::string & fname) {
struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name); struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
printf("%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, cur->n_dims, cur->name, cur->data); printf("%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, ggml_n_dims(cur), cur->name, cur->data);
// print first 10 elements // print first 10 elements
const float * data = (const float *) cur->data; const float * data = (const float *) cur->data;

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,5 @@
set(TARGET infill)
add_executable(${TARGET} infill.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

41
examples/infill/README.md Normal file
View file

@ -0,0 +1,41 @@
# llama.cpp/example/infill
This example shows how to use the infill mode with Code Llama models supporting infill mode.
Currently the 7B and 13B models support infill mode.
Infill supports most of the options available in the main example.
For further information have a look at the main README.md in llama.cpp/example/main/README.md
## Common Options
In this section, we cover the most commonly used options for running the `infill` program with the LLaMA models:
- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
- `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
- `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
## Input Prompts
The `infill` program provides several ways to interact with the LLaMA models using input prompts:
- `--in-prefix PROMPT_BEFORE_CURSOR`: Provide the prefix directly as a command-line option.
- `--in-suffix PROMPT_AFTER_CURSOR`: Provide the suffix directly as a command-line option.
- `--interactive-first`: Run the program in interactive mode and wait for input right away. (More on this below.)
## Interaction
The `infill` program offers a seamless way to interact with LLaMA models, allowing users to receive real-time infill suggestions. The interactive mode can be triggered using `--interactive`, and `--interactive-first`
### Interaction Options
- `-i, --interactive`: Run the program in interactive mode, allowing users to get real time code suggestions from model.
- `--interactive-first`: Run the program in interactive mode and immediately wait for user input before starting the text generation.
- `--color`: Enable colorized output to differentiate visually distinguishing between prompts, user input, and generated text.
### Example
```bash
./infill -t 10 -ngl 0 -m models/codellama-13b.Q5_K_S.gguf -c 4096 --temp 0.7 --repeat_penalty 1.1 -n 20 --in-prefix "def helloworld():\n print(\"hell" --in-suffix "\n print(\"goodbye world\")\n "
```

765
examples/infill/infill.cpp Normal file
View file

@ -0,0 +1,765 @@
#include "common.h"
#include "console.h"
#include "llama.h"
#include "grammar-parser.h"
#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
#include <signal.h>
#include <unistd.h>
#elif defined (_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>
#include <signal.h>
#endif
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
static llama_context ** g_ctx;
static llama_model ** g_model;
static gpt_params * g_params;
static std::vector<llama_token> * g_input_tokens;
static std::ostringstream * g_output_ss;
static std::vector<llama_token> * g_output_tokens;
static bool is_interacting = false;
static void write_logfile(
const llama_context * ctx, const gpt_params & params, const llama_model * model,
const std::vector<llama_token> & input_tokens, const std::string & output,
const std::vector<llama_token> & output_tokens
) {
if (params.logdir.empty()) {
return;
}
const std::string timestamp = get_sortable_timestamp();
const bool success = create_directory_with_parents(params.logdir);
if (!success) {
fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
__func__, params.logdir.c_str());
return;
}
const std::string logfile_path = params.logdir + timestamp + ".yml";
FILE * logfile = fopen(logfile_path.c_str(), "w");
if (logfile == NULL) {
fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
return;
}
fprintf(logfile, "binary: infill\n");
char model_desc[128];
llama_model_desc(model, model_desc, sizeof(model_desc));
dump_non_result_info_yaml(logfile, params, ctx, timestamp, input_tokens, model_desc);
fprintf(logfile, "\n");
fprintf(logfile, "######################\n");
fprintf(logfile, "# Generation Results #\n");
fprintf(logfile, "######################\n");
fprintf(logfile, "\n");
dump_string_yaml_multiline(logfile, "output", output.c_str());
dump_vector_int_yaml(logfile, "output_tokens", output_tokens);
llama_dump_timing_info_yaml(logfile, ctx);
fclose(logfile);
}
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
static void sigint_handler(int signo) {
if (signo == SIGINT) {
if (!is_interacting) {
is_interacting = true;
} else {
console::cleanup();
printf("\n");
llama_print_timings(*g_ctx);
write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
_exit(130);
}
}
}
#endif
int main(int argc, char ** argv) {
gpt_params params;
llama_sampling_params & sparams = params.sparams;
g_params = &params;
if (!gpt_params_parse(argc, argv, params)) {
return 1;
}
#ifndef LOG_DISABLE_LOGS
log_set_target(log_filename_generator("infill", "log"));
LOG_TEE("Log start\n");
log_dump_cmdline(argc, argv);
#endif // LOG_DISABLE_LOGS
console::init(params.simple_io, params.use_color);
atexit([]() { console::cleanup(); });
if (params.logits_all) {
printf("\n************\n");
printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
printf("************\n\n");
return 0;
}
if (params.embedding) {
printf("\n************\n");
printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
printf("************\n\n");
return 0;
}
if (params.n_ctx != 0 && params.n_ctx < 8) {
LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
params.n_ctx = 8;
}
if (params.instruct) {
printf("\n************\n");
printf("%s: please use the 'main' tool for instruct mode\n", __func__);
printf("************\n\n");
return 0;
}
if (params.chatml) {
printf("\n************\n");
printf("%s: please use the 'main' tool for chatml mode\n", __func__);
printf("************\n\n");
return 0;
}
if (!params.antiprompt.empty()) {
printf("\n************\n");
printf("%s: please use the 'main' tool for antiprompt mode\n", __func__);
printf("************\n\n");
return 0;
}
if (!params.interactive_first && (params.input_prefix.empty() && params.input_suffix.empty())) {
printf("\n************\n");
printf("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__);
printf("************\n\n");
return 0;
}
if (params.random_prompt) {
printf("\n************\n");
printf("%s: please use the 'main' tool for random prompt mode\n", __func__);
printf("************\n\n");
return 0;
}
if (!params.path_prompt_cache.empty()) {
printf("\n************\n");
printf("%s: infill does not support prompt caching\n", __func__);
printf("************\n\n");
return 0;
}
if (params.rope_freq_base != 0.0) {
LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
}
if (params.rope_freq_scale != 0.0) {
LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
}
LOG_TEE("%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);
if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL);
}
LOG_TEE("%s: seed = %u\n", __func__, params.seed);
std::mt19937 rng(params.seed);
LOG("%s: llama backend init\n", __func__);
llama_backend_init(params.numa);
llama_model * model;
llama_context * ctx;
llama_context * ctx_guidance = NULL;
g_model = &model;
g_ctx = &ctx;
// load the model and apply lora adapter, if any
LOG("%s: load the model and apply lora adapter, if any\n", __func__);
std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (sparams.cfg_scale > 1.f) {
struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
ctx_guidance = llama_new_context_with_model(model, lparams);
}
if (model == NULL) {
LOG_TEE("%s: error: unable to load model\n", __func__);
return 1;
}
const int n_ctx_train = llama_n_ctx_train(model);
const int n_ctx = llama_n_ctx(ctx);
LOG("n_ctx: %d\n", n_ctx);
if (n_ctx > n_ctx_train) {
LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, n_ctx);
}
// print system information
{
LOG_TEE("\n");
LOG_TEE("%s\n", get_system_info(params).c_str());
}
const bool add_bos = llama_should_add_bos_token(model);
LOG("add_bos: %d\n", add_bos);
bool suff_rm_leading_spc = params.escape;
if (suff_rm_leading_spc && params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) {
params.input_suffix.erase(0, 1);
suff_rm_leading_spc = false;
}
std::vector<llama_token> embd_inp;
std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
const int space_token = 29871;
if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
inp_sfx.erase(inp_sfx.begin());
}
inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
if (add_bos) {
inp_pfx.insert(inp_pfx.begin(), llama_token_bos(model));
}
inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
embd_inp = inp_pfx;
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
embd_inp.push_back(llama_token_middle(model));
LOG("prefix: \"%s\"\n", log_tostr(params.input_prefix));
LOG("suffix: \"%s\"\n", log_tostr(params.input_suffix));
LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
// Should not run without any tokens
if (embd_inp.empty()) {
embd_inp.push_back(llama_token_bos(model));
LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
}
// Tokenize negative prompt
std::vector<llama_token> guidance_inp;
int guidance_offset = 0;
int original_prompt_len = 0;
if (ctx_guidance) {
LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos);
LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
original_prompt_len = original_inp.size();
guidance_offset = (int)guidance_inp.size() - original_prompt_len;
LOG("original_prompt_len: %s", log_tostr(original_prompt_len));
LOG("guidance_offset: %s", log_tostr(guidance_offset));
}
if ((int) embd_inp.size() > n_ctx - 4) {
LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
return 1;
}
// number of tokens to keep when resetting context
if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size()) {
params.n_keep = (int)embd_inp.size();
}
LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());
// enable interactive mode if interactive start is specified
if (params.interactive_first) {
params.interactive = true;
}
if (params.verbose_prompt) {
LOG_TEE("\n");
LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
for (int i = 0; i < (int) embd_inp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
}
if (ctx_guidance) {
LOG_TEE("\n");
LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
for (int i = 0; i < (int) guidance_inp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
}
}
if (params.n_keep > 0) {
LOG_TEE("%s: static prompt based on n_keep: '", __func__);
for (int i = 0; i < params.n_keep; i++) {
LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
}
LOG_TEE("'\n");
}
LOG_TEE("\n");
}
if (params.interactive) {
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
struct sigaction sigint_action;
sigint_action.sa_handler = sigint_handler;
sigemptyset (&sigint_action.sa_mask);
sigint_action.sa_flags = 0;
sigaction(SIGINT, &sigint_action, NULL);
#elif defined (_WIN32)
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
};
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
#endif
LOG_TEE("%s: interactive mode on.\n", __func__);
if (params.input_prefix_bos) {
LOG_TEE("Input prefix with BOS\n");
}
if (!params.input_prefix.empty()) {
LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str());
}
if (!params.input_suffix.empty()) {
LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
}
}
LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
LOG_TEE("\n\n");
LOG_TEE("\n##### Infill mode #####\n\n");
if (params.infill) {
printf("\n************\n");
printf("no need to specify '--infill', always running infill\n");
printf("************\n\n");
}
if (params.interactive) {
const char *control_message;
if (params.multiline_input) {
control_message = " - To return control to LLaMa, end your input with '\\'.\n"
" - To return control without starting a new line, end your input with '/'.\n";
} else {
control_message = " - Press Return to return control to LLaMa.\n"
" - To return control without starting a new line, end your input with '/'.\n"
" - If you want to submit another line, end your input with '\\'.\n";
}
LOG_TEE("== Running in interactive mode. ==\n");
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
LOG_TEE( " - Press Ctrl+C to interject at any time.\n");
#endif
LOG_TEE( "%s\n", control_message);
is_interacting = params.interactive_first;
}
bool input_echo = true;
int n_past = 0;
int n_remain = params.n_predict;
int n_consumed = 0;
int n_past_guidance = 0;
std::vector<int> input_tokens; g_input_tokens = &input_tokens;
std::vector<int> output_tokens; g_output_tokens = &output_tokens;
std::ostringstream output_ss; g_output_ss = &output_ss;
// the first thing we will do is to output the prompt, so set color accordingly
console::set_display(console::prompt);
std::vector<llama_token> embd;
std::vector<llama_token> embd_guidance;
struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
while (n_remain != 0 || params.interactive) {
// predict
if (!embd.empty()) {
// Note: n_ctx - 4 here is to match the logic for commandline prompt handling via
// --prompt or --file which uses the same value.
int max_embd_size = n_ctx - 4;
// Ensure the input doesn't exceed the context size by truncating embd if necessary.
if ((int) embd.size() > max_embd_size) {
const int skipped_tokens = (int) embd.size() - max_embd_size;
embd.resize(max_embd_size);
console::set_display(console::error);
printf("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
console::set_display(console::reset);
fflush(stdout);
}
// infinite text generation via context swapping
// if we run out of context:
// - take the n_keep first tokens from the original prompt (via n_past)
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) {
if (params.n_predict == -2) {
LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
break;
}
const int n_left = n_past - params.n_keep - 1;
const int n_discard = n_left/2;
LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
n_past, n_left, n_ctx, params.n_keep, n_discard);
llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
n_past -= n_discard;
if (ctx_guidance) {
n_past_guidance -= n_discard;
}
LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance);
LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
}
// evaluate tokens in batches
// embd is typically prepared beforehand to fit within a batch, but not always
if (ctx_guidance) {
int input_size = 0;
llama_token * input_buf = NULL;
if (n_past_guidance < (int) guidance_inp.size()) {
// Guidance context should have the same data with these modifications:
//
// * Replace the initial prompt
// * Shift everything by guidance_offset
embd_guidance = guidance_inp;
if (embd.begin() + original_prompt_len < embd.end()) {
embd_guidance.insert(
embd_guidance.end(),
embd.begin() + original_prompt_len,
embd.end()
);
}
input_buf = embd_guidance.data();
input_size = embd_guidance.size();
LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance).c_str());
} else {
input_buf = embd.data();
input_size = embd.size();
}
for (int i = 0; i < input_size; i += params.n_batch) {
int n_eval = std::min(input_size - i, params.n_batch);
if (llama_decode(ctx_guidance, llama_batch_get_one(input_buf + i, n_eval, n_past_guidance, 0))) {
LOG_TEE("%s : failed to eval\n", __func__);
return 1;
}
n_past_guidance += n_eval;
}
}
for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
int n_eval = (int) embd.size() - i;
if (n_eval > params.n_batch) {
n_eval = params.n_batch;
}
LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
LOG_TEE("%s : failed to eval\n", __func__);
return 1;
}
n_past += n_eval;
LOG("n_past = %d\n", n_past);
}
}
embd.clear();
embd_guidance.clear();
if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);
llama_sampling_accept(ctx_sampling, ctx, id, true);
LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());
embd.push_back(id);
// echo this to console
input_echo = true;
// decrement remaining sampling budget
--n_remain;
LOG("n_remain: %d\n", n_remain);
} else {
// some user input remains from prompt or interaction, forward it to processing
LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
while ((int) embd_inp.size() > n_consumed) {
embd.push_back(embd_inp[n_consumed]);
// push the prompt in the sampling context in order to apply repetition penalties later
// for the prompt, we don't apply grammar rules
llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], false);
++n_consumed;
if ((int) embd.size() >= params.n_batch) {
break;
}
}
}
// display text
if (input_echo) {
for (auto id : embd) {
const std::string token_str = llama_token_to_piece(ctx, id);
printf("%s", token_str.c_str());
if (embd.size() > 1) {
input_tokens.push_back(id);
} else {
output_tokens.push_back(id);
output_ss << token_str;
}
}
fflush(stdout);
}
// reset color to default if we there is no pending user input
if (input_echo && (int) embd_inp.size() == n_consumed) {
console::set_display(console::reset);
}
// if not currently processing queued inputs;
if ((int) embd_inp.size() <= n_consumed) {
// deal with eot token in infill mode
if ((llama_sampling_last(ctx_sampling) == llama_token_eot(model) || is_interacting) && params.interactive){
if(is_interacting && !params.interactive_first) {
// print an eot token
printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
}
fflush(stdout);
printf("\n");
console::set_display(console::user_input);
std::string buffer;
std::string line;
bool another_line=true;
// set a new prefix via stdin
do {
another_line = console::readline(line, params.multiline_input);
buffer += line;
} while (another_line);
// check if we got an empty line, if so we use the old input
if (!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
params.input_prefix = buffer;
}
buffer.clear();
// set a new suffix via stdin
do {
another_line = console::readline(line, params.multiline_input);
buffer += line;
} while (another_line);
// check if we got an empty line
if (!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
params.input_suffix = buffer;
}
buffer.clear();
// done taking input, reset color
console::set_display(console::reset);
if (params.escape) {
//process escape sequences, for the initial prompt this is done in common.cpp when we load the params, but for the interactive mode we need to do it here
process_escapes(params.input_prefix);
process_escapes(params.input_suffix);
}
suff_rm_leading_spc = params.escape;
if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
params.input_suffix.erase(0, 1);
suff_rm_leading_spc = false;
}
// tokenize new prefix and suffix
std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
inp_sfx.erase(inp_sfx.begin());
}
inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
if (add_bos) {
inp_pfx.insert(inp_pfx.begin(), llama_token_bos(model));
}
inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
embd_inp = inp_pfx;
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
embd_inp.push_back(llama_token_middle(model));
embd.clear();
embd_guidance.clear();
n_remain = params.n_predict;
n_past = 0;
n_consumed = 0;
// LOG_TEE("took new input\n");
is_interacting = false;
}
// deal with end of text token in interactive mode
else if (llama_sampling_last(ctx_sampling) == llama_token_eos(model)) {
LOG("found EOS token\n");
if (params.interactive) {
is_interacting = true;
printf("\n");
console::set_display(console::user_input);
fflush(stdout);
}
}
if (n_past > 0 && is_interacting && !params.interactive) {
LOG("waiting for user input\n");
if (params.input_prefix_bos) {
LOG("adding input prefix BOS token\n");
embd_inp.push_back(llama_token_bos(model));
}
std::string buffer;
if (!params.input_prefix.empty()) {
LOG("appending input prefix: '%s'\n", params.input_prefix.c_str());
buffer += params.input_prefix;
printf("%s", buffer.c_str());
}
std::string line;
bool another_line = true;
do {
another_line = console::readline(line, params.multiline_input);
buffer += line;
} while (another_line);
// done taking input, reset color
console::set_display(console::reset);
// Add tokens to embd only if the input buffer is non-empty
// Entering a empty line lets the user pass control back
if (buffer.length() > 1) {
// append input suffix if any
if (!params.input_suffix.empty()) {
LOG("appending input suffix: '%s'\n", params.input_suffix.c_str());
buffer += params.input_suffix;
printf("%s", params.input_suffix.c_str());
}
LOG("buffer: '%s'\n", buffer.c_str());
const size_t original_size = embd_inp.size();
const auto line_inp = ::llama_tokenize(ctx, buffer, false);
LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
for (size_t i = original_size; i < embd_inp.size(); ++i) {
const llama_token token = embd_inp[i];
output_tokens.push_back(token);
output_ss << llama_token_to_piece(ctx, token);
}
n_remain -= line_inp.size();
LOG("n_remain: %d\n", n_remain);
} else {
LOG("empty line, passing control back\n");
}
input_echo = false; // do not echo this again
}
if (n_past > 0) {
if (is_interacting) {
llama_sampling_reset(ctx_sampling);
}
is_interacting = false;
}
}
// end of text token
if (!embd.empty() && embd.back() == llama_token_eos(model) && !params.interactive) {
break;
}
// In interactive mode, respect the maximum number of tokens and drop back to user input when reached.
// We skip this logic when n_predict == -1 (infinite) or -2 (stop at context size).
if (params.interactive && n_remain <= 0 && params.n_predict >= 0) {
n_remain = params.n_predict;
is_interacting = true;
}
}
if (!params.interactive && n_remain <= 0) {
printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
fflush(stdout);
}
llama_print_timings(ctx);
write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
if (ctx_guidance) { llama_free(ctx_guidance); }
llama_free(ctx);
llama_free_model(model);
llama_sampling_free(ctx_sampling);
llama_backend_free();
#ifndef LOG_DISABLE_LOGS
LOG_TEE("Log end\n");
#endif // LOG_DISABLE_LOGS
return 0;
}

View file

@ -2,7 +2,7 @@
This is pretty much just a straight port of aigoopy/llm-jeopardy/ with an added graph viewer. This is pretty much just a straight port of aigoopy/llm-jeopardy/ with an added graph viewer.
The jeopardy test can be used to compare the fact knowledge of different models and compare them to eachother. This is in contrast to some other tests, which test logical deduction, creativity, writing skills, etc. The jeopardy test can be used to compare the fact knowledge of different models and compare them to each other. This is in contrast to some other tests, which test logical deduction, creativity, writing skills, etc.
Step 1: Open jeopardy.sh and modify the following: Step 1: Open jeopardy.sh and modify the following:

View file

@ -3,6 +3,3 @@ add_executable(${TARGET} llama-bench.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()

View file

@ -19,7 +19,6 @@
#include "ggml.h" #include "ggml.h"
#include "llama.h" #include "llama.h"
#include "common.h" #include "common.h"
#include "build-info.h"
#include "ggml-cuda.h" #include "ggml-cuda.h"
// utils // utils
@ -54,6 +53,13 @@ static std::vector<T> split(const std::string & str, char delim) {
return values; return values;
} }
template<typename T, typename F>
static std::vector<std::string> transform_to_str(const std::vector<T> & values, F f) {
std::vector<std::string> str_values;
std::transform(values.begin(), values.end(), std::back_inserter(str_values), f);
return str_values;
}
template<typename T> template<typename T>
static T avg(const std::vector<T> & v) { static T avg(const std::vector<T> & v) {
if (v.empty()) { if (v.empty()) {
@ -127,7 +133,8 @@ struct cmd_params {
std::vector<int> n_prompt; std::vector<int> n_prompt;
std::vector<int> n_gen; std::vector<int> n_gen;
std::vector<int> n_batch; std::vector<int> n_batch;
std::vector<bool> f32_kv; std::vector<ggml_type> type_k;
std::vector<ggml_type> type_v;
std::vector<int> n_threads; std::vector<int> n_threads;
std::vector<int> n_gpu_layers; std::vector<int> n_gpu_layers;
std::vector<int> main_gpu; std::vector<int> main_gpu;
@ -143,7 +150,8 @@ static const cmd_params cmd_params_defaults = {
/* n_prompt */ {512}, /* n_prompt */ {512},
/* n_gen */ {128}, /* n_gen */ {128},
/* n_batch */ {512}, /* n_batch */ {512},
/* f32_kv */ {false}, /* type_k */ {GGML_TYPE_F16},
/* type_v */ {GGML_TYPE_F16},
/* n_threads */ {get_num_physical_cores()}, /* n_threads */ {get_num_physical_cores()},
/* n_gpu_layers */ {99}, /* n_gpu_layers */ {99},
/* main_gpu */ {0}, /* main_gpu */ {0},
@ -163,7 +171,8 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str()); printf(" -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str()); printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
printf(" -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str()); printf(" -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
printf(" --memory-f32 <0|1> (default: %s)\n", join(cmd_params_defaults.f32_kv, ",").c_str()); printf(" -ctk <t>, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
printf(" -ctv <t>, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str()); printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str()); printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str()); printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
@ -174,9 +183,32 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0"); printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
printf("\n"); printf("\n");
printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n"); printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
} }
static ggml_type ggml_type_from_name(const std::string & s) {
if (s == "f16") {
return GGML_TYPE_F16;
}
if (s == "q8_0") {
return GGML_TYPE_Q8_0;
}
if (s == "q4_0") {
return GGML_TYPE_Q4_0;
}
if (s == "q4_1") {
return GGML_TYPE_Q4_1;
}
if (s == "q5_0") {
return GGML_TYPE_Q5_0;
}
if (s == "q5_1") {
return GGML_TYPE_Q5_1;
}
return GGML_TYPE_COUNT;
}
static cmd_params parse_cmd_params(int argc, char ** argv) { static cmd_params parse_cmd_params(int argc, char ** argv) {
cmd_params params; cmd_params params;
std::string arg; std::string arg;
@ -225,13 +257,38 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
} }
auto p = split<int>(argv[i], split_delim); auto p = split<int>(argv[i], split_delim);
params.n_batch.insert(params.n_batch.end(), p.begin(), p.end()); params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
} else if (arg == "--memory-f32") { } else if (arg == "-ctk" || arg == "--cache-type-k") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
break; break;
} }
auto p = split<int>(argv[i], split_delim); auto p = split<std::string>(argv[i], split_delim);
params.f32_kv.insert(params.f32_kv.end(), p.begin(), p.end()); std::vector<ggml_type> types;
for (const auto & t : p) {
ggml_type gt = ggml_type_from_name(t);
if (gt == GGML_TYPE_COUNT) {
invalid_param = true;
break;
}
types.push_back(gt);
}
params.type_k.insert(params.type_k.end(), types.begin(), types.end());
} else if (arg == "-ctv" || arg == "--cache-type-v") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = split<std::string>(argv[i], split_delim);
std::vector<ggml_type> types;
for (const auto & t : p) {
ggml_type gt = ggml_type_from_name(t);
if (gt == GGML_TYPE_COUNT) {
invalid_param = true;
break;
}
types.push_back(gt);
}
params.type_v.insert(params.type_v.end(), types.begin(), types.end());
} else if (arg == "-t" || arg == "--threads") { } else if (arg == "-t" || arg == "--threads") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
@ -322,7 +379,8 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
if (params.n_prompt.empty()) { params.n_prompt = cmd_params_defaults.n_prompt; } if (params.n_prompt.empty()) { params.n_prompt = cmd_params_defaults.n_prompt; }
if (params.n_gen.empty()) { params.n_gen = cmd_params_defaults.n_gen; } if (params.n_gen.empty()) { params.n_gen = cmd_params_defaults.n_gen; }
if (params.n_batch.empty()) { params.n_batch = cmd_params_defaults.n_batch; } if (params.n_batch.empty()) { params.n_batch = cmd_params_defaults.n_batch; }
if (params.f32_kv.empty()) { params.f32_kv = cmd_params_defaults.f32_kv; } if (params.type_k.empty()) { params.type_k = cmd_params_defaults.type_k; }
if (params.type_v.empty()) { params.type_v = cmd_params_defaults.type_v; }
if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; } if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; } if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; }
if (params.mul_mat_q.empty()) { params.mul_mat_q = cmd_params_defaults.mul_mat_q; } if (params.mul_mat_q.empty()) { params.mul_mat_q = cmd_params_defaults.mul_mat_q; }
@ -337,7 +395,8 @@ struct cmd_params_instance {
int n_prompt; int n_prompt;
int n_gen; int n_gen;
int n_batch; int n_batch;
bool f32_kv; ggml_type type_k;
ggml_type type_v;
int n_threads; int n_threads;
int n_gpu_layers; int n_gpu_layers;
int main_gpu; int main_gpu;
@ -366,7 +425,8 @@ struct cmd_params_instance {
cparams.n_ctx = n_prompt + n_gen; cparams.n_ctx = n_prompt + n_gen;
cparams.n_batch = n_batch; cparams.n_batch = n_batch;
cparams.f16_kv = !f32_kv; cparams.type_k = type_k;
cparams.type_v = type_v;
cparams.mul_mat_q = mul_mat_q; cparams.mul_mat_q = mul_mat_q;
return cparams; return cparams;
@ -381,7 +441,8 @@ static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_p
for (const auto & mg : params.main_gpu) for (const auto & mg : params.main_gpu)
for (const auto & ts : params.tensor_split) for (const auto & ts : params.tensor_split)
for (const auto & nb : params.n_batch) for (const auto & nb : params.n_batch)
for (const auto & fk : params.f32_kv) for (const auto & tk : params.type_k)
for (const auto & tv : params.type_v)
for (const auto & mmq : params.mul_mat_q) for (const auto & mmq : params.mul_mat_q)
for (const auto & nt : params.n_threads) { for (const auto & nt : params.n_threads) {
cmd_params_instance instance = { cmd_params_instance instance = {
@ -389,7 +450,8 @@ static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_p
/* .n_prompt = */ n_prompt, /* .n_prompt = */ n_prompt,
/* .n_gen = */ n_gen, /* .n_gen = */ n_gen,
/* .n_batch = */ nb, /* .n_batch = */ nb,
/* .f32_kv = */ fk, /* .type_k = */ tk,
/* .type_v = */ tv,
/* .n_threads = */ nt, /* .n_threads = */ nt,
/* .n_gpu_layers = */ nl, /* .n_gpu_layers = */ nl,
/* .main_gpu = */ mg, /* .main_gpu = */ mg,
@ -411,7 +473,8 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
for (const auto & mg : params.main_gpu) for (const auto & mg : params.main_gpu)
for (const auto & ts : params.tensor_split) for (const auto & ts : params.tensor_split)
for (const auto & nb : params.n_batch) for (const auto & nb : params.n_batch)
for (const auto & fk : params.f32_kv) for (const auto & tk : params.type_k)
for (const auto & tv : params.type_v)
for (const auto & mmq : params.mul_mat_q) for (const auto & mmq : params.mul_mat_q)
for (const auto & nt : params.n_threads) { for (const auto & nt : params.n_threads) {
for (const auto & n_prompt : params.n_prompt) { for (const auto & n_prompt : params.n_prompt) {
@ -423,7 +486,8 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .n_prompt = */ n_prompt, /* .n_prompt = */ n_prompt,
/* .n_gen = */ 0, /* .n_gen = */ 0,
/* .n_batch = */ nb, /* .n_batch = */ nb,
/* .f32_kv = */ fk, /* .type_k = */ tk,
/* .type_v = */ tv,
/* .n_threads = */ nt, /* .n_threads = */ nt,
/* .n_gpu_layers = */ nl, /* .n_gpu_layers = */ nl,
/* .main_gpu = */ mg, /* .main_gpu = */ mg,
@ -442,7 +506,8 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .n_prompt = */ 0, /* .n_prompt = */ 0,
/* .n_gen = */ n_gen, /* .n_gen = */ n_gen,
/* .n_batch = */ nb, /* .n_batch = */ nb,
/* .f32_kv = */ fk, /* .type_k = */ tk,
/* .type_v = */ tv,
/* .n_threads = */ nt, /* .n_threads = */ nt,
/* .n_gpu_layers = */ nl, /* .n_gpu_layers = */ nl,
/* .main_gpu = */ mg, /* .main_gpu = */ mg,
@ -490,7 +555,8 @@ struct test {
uint64_t model_n_params; uint64_t model_n_params;
int n_batch; int n_batch;
int n_threads; int n_threads;
bool f32_kv; ggml_type type_k;
ggml_type type_v;
int n_gpu_layers; int n_gpu_layers;
int main_gpu; int main_gpu;
bool mul_mat_q; bool mul_mat_q;
@ -509,7 +575,8 @@ struct test {
model_n_params = llama_model_n_params(lmodel); model_n_params = llama_model_n_params(lmodel);
n_batch = inst.n_batch; n_batch = inst.n_batch;
n_threads = inst.n_threads; n_threads = inst.n_threads;
f32_kv = inst.f32_kv; type_k = inst.type_k;
type_v = inst.type_v;
n_gpu_layers = inst.n_gpu_layers; n_gpu_layers = inst.n_gpu_layers;
main_gpu = inst.main_gpu; main_gpu = inst.main_gpu;
mul_mat_q = inst.mul_mat_q; mul_mat_q = inst.mul_mat_q;
@ -572,7 +639,7 @@ struct test {
"cuda", "opencl", "metal", "gpu_blas", "blas", "cuda", "opencl", "metal", "gpu_blas", "blas",
"cpu_info", "gpu_info", "cpu_info", "gpu_info",
"model_filename", "model_type", "model_size", "model_n_params", "model_filename", "model_type", "model_size", "model_n_params",
"n_batch", "n_threads", "f16_kv", "n_batch", "n_threads", "type_k", "type_v",
"n_gpu_layers", "main_gpu", "mul_mat_q", "tensor_split", "n_gpu_layers", "main_gpu", "mul_mat_q", "tensor_split",
"n_prompt", "n_gen", "test_time", "n_prompt", "n_gen", "test_time",
"avg_ns", "stddev_ns", "avg_ns", "stddev_ns",
@ -622,7 +689,7 @@ struct test {
std::to_string(cuda), std::to_string(opencl), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas), std::to_string(cuda), std::to_string(opencl), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas),
cpu_info, gpu_info, cpu_info, gpu_info,
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params), model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
std::to_string(n_batch), std::to_string(n_threads), std::to_string(!f32_kv), std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), tensor_split_str, std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), tensor_split_str,
std::to_string(n_prompt), std::to_string(n_gen), test_time, std::to_string(n_prompt), std::to_string(n_gen), test_time,
std::to_string(avg_ns()), std::to_string(stdev_ns()), std::to_string(avg_ns()), std::to_string(stdev_ns()),
@ -641,8 +708,8 @@ struct test {
} }
}; };
const std::string test::build_commit = BUILD_COMMIT; const std::string test::build_commit = LLAMA_COMMIT;
const int test::build_number = BUILD_NUMBER; const int test::build_number = LLAMA_BUILD_NUMBER;
const bool test::cuda = !!ggml_cpu_has_cublas(); const bool test::cuda = !!ggml_cpu_has_cublas();
const bool test::opencl = !!ggml_cpu_has_clblast(); const bool test::opencl = !!ggml_cpu_has_clblast();
const bool test::metal = !!ggml_cpu_has_metal(); const bool test::metal = !!ggml_cpu_has_metal();
@ -806,8 +873,11 @@ struct markdown_printer : public printer {
if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) { if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
fields.push_back("n_batch"); fields.push_back("n_batch");
} }
if (params.f32_kv.size() > 1 || params.f32_kv != cmd_params_defaults.f32_kv) { if (params.type_k.size() > 1 || params.type_k != cmd_params_defaults.type_k) {
fields.push_back("f16_kv"); fields.push_back("type_k");
}
if (params.type_v.size() > 1 || params.type_v != cmd_params_defaults.type_v) {
fields.push_back("type_v");
} }
if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) { if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) {
fields.push_back("main_gpu"); fields.push_back("main_gpu");
@ -933,7 +1003,7 @@ struct sql_printer : public printer {
}; };
static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) { static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) {
std::vector<llama_token> tokens(n_batch, llama_token_bos(ctx)); std::vector<llama_token> tokens(n_batch, llama_token_bos(llama_get_model(ctx)));
int n_processed = 0; int n_processed = 0;
llama_set_n_threads(ctx, n_threads, n_threads); llama_set_n_threads(ctx, n_threads, n_threads);
@ -946,7 +1016,7 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_bat
} }
static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) { static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) {
llama_token token = llama_token_bos(ctx); llama_token token = llama_token_bos(llama_get_model(ctx));
llama_set_n_threads(ctx, n_threads, n_threads); llama_set_n_threads(ctx, n_threads, n_threads);
@ -1037,7 +1107,7 @@ int main(int argc, char ** argv) {
test t(inst, lmodel, ctx); test t(inst, lmodel, ctx);
llama_kv_cache_tokens_rm(ctx, -1, -1); llama_kv_cache_clear(ctx);
// warmup run // warmup run
if (t.n_prompt > 0) { if (t.n_prompt > 0) {
@ -1048,7 +1118,7 @@ int main(int argc, char ** argv) {
} }
for (int i = 0; i < params.reps; i++) { for (int i = 0; i < params.reps; i++) {
llama_kv_cache_tokens_rm(ctx, -1, -1); llama_kv_cache_clear(ctx);
uint64_t t_start = get_time_ns(); uint64_t t_start = get_time_ns();
if (t.n_prompt > 0) { if (t.n_prompt > 0) {

1
examples/llama.swiftui/.gitignore vendored Normal file
View file

@ -0,0 +1 @@
xcuserdata

View file

@ -0,0 +1,7 @@
# llama.swiftui
Local inference of llama.cpp on an iPhone.
So far I only tested with starcoder 1B model, but it can most likely handle 7B models as well.
https://github.com/bachittle/llama.cpp/assets/39804642/e290827a-4edb-4093-9642-2a5e399ec545

View file

@ -0,0 +1,208 @@
import Foundation
// import llama
enum LlamaError: Error {
case couldNotInitializeContext
}
actor LlamaContext {
private var model: OpaquePointer
private var context: OpaquePointer
private var batch: llama_batch
private var tokens_list: [llama_token]
/// This variable is used to store temporarily invalid cchars
private var temporary_invalid_cchars: [CChar]
var n_len: Int32 = 512
var n_cur: Int32 = 0
var n_decode: Int32 = 0
init(model: OpaquePointer, context: OpaquePointer) {
self.model = model
self.context = context
self.tokens_list = []
self.batch = llama_batch_init(512, 0, 1)
self.temporary_invalid_cchars = []
}
deinit {
llama_free(context)
llama_free_model(model)
llama_backend_free()
}
static func createContext(path: String) throws -> LlamaContext {
llama_backend_init(false)
let model_params = llama_model_default_params()
let model = llama_load_model_from_file(path, model_params)
guard let model else {
print("Could not load model at \(path)")
throw LlamaError.couldNotInitializeContext
}
var ctx_params = llama_context_default_params()
ctx_params.seed = 1234
ctx_params.n_ctx = 2048
ctx_params.n_threads = 8
ctx_params.n_threads_batch = 8
let context = llama_new_context_with_model(model, ctx_params)
guard let context else {
print("Could not load context!")
throw LlamaError.couldNotInitializeContext
}
return LlamaContext(model: model, context: context)
}
func get_n_tokens() -> Int32 {
return batch.n_tokens;
}
func completion_init(text: String) {
print("attempting to complete \"\(text)\"")
tokens_list = tokenize(text: text, add_bos: true)
temporary_invalid_cchars = []
let n_ctx = llama_n_ctx(context)
let n_kv_req = tokens_list.count + (Int(n_len) - tokens_list.count)
print("\n n_len = \(n_len), n_ctx = \(n_ctx), n_kv_req = \(n_kv_req)")
if n_kv_req > n_ctx {
print("error: n_kv_req > n_ctx, the required KV cache size is not big enough")
}
for id in tokens_list {
print(String(cString: token_to_piece(token: id) + [0]))
}
// batch = llama_batch_init(512, 0) // done in init()
batch.n_tokens = Int32(tokens_list.count)
for i1 in 0..<batch.n_tokens {
let i = Int(i1)
batch.token[i] = tokens_list[i]
batch.pos[i] = i1
batch.n_seq_id[Int(i)] = 1
batch.seq_id[Int(i)]![0] = 0
batch.logits[i] = 0
}
batch.logits[Int(batch.n_tokens) - 1] = 1 // true
if llama_decode(context, batch) != 0 {
print("llama_decode() failed")
}
n_cur = batch.n_tokens
}
func completion_loop() -> String {
var new_token_id: llama_token = 0
let n_vocab = llama_n_vocab(model)
let logits = llama_get_logits_ith(context, batch.n_tokens - 1)
var candidates = Array<llama_token_data>()
candidates.reserveCapacity(Int(n_vocab))
for token_id in 0..<n_vocab {
candidates.append(llama_token_data(id: token_id, logit: logits![Int(token_id)], p: 0.0))
}
candidates.withUnsafeMutableBufferPointer() { buffer in
var candidates_p = llama_token_data_array(data: buffer.baseAddress, size: buffer.count, sorted: false)
new_token_id = llama_sample_token_greedy(context, &candidates_p)
}
if new_token_id == llama_token_eos(context) || n_cur == n_len {
print("\n")
let new_token_str = String(cString: temporary_invalid_cchars + [0])
temporary_invalid_cchars.removeAll()
return new_token_str
}
let new_token_cchars = token_to_piece(token: new_token_id)
temporary_invalid_cchars.append(contentsOf: new_token_cchars)
let new_token_str: String
if let string = String(validatingUTF8: temporary_invalid_cchars + [0]) {
temporary_invalid_cchars.removeAll()
new_token_str = string
} else if (0 ..< temporary_invalid_cchars.count).contains(where: {$0 != 0 && String(validatingUTF8: Array(temporary_invalid_cchars.suffix($0)) + [0]) != nil}) {
// in this case, at least the suffix of the temporary_invalid_cchars can be interpreted as UTF8 string
let string = String(cString: temporary_invalid_cchars + [0])
temporary_invalid_cchars.removeAll()
new_token_str = string
} else {
new_token_str = ""
}
print(new_token_str)
// tokens_list.append(new_token_id)
batch.n_tokens = 0
batch.token[Int(batch.n_tokens)] = new_token_id
batch.pos[Int(batch.n_tokens)] = n_cur
batch.n_seq_id[Int(batch.n_tokens)] = 1
batch.seq_id[Int(batch.n_tokens)]![0] = 0
batch.logits[Int(batch.n_tokens)] = 1 // true
batch.n_tokens += 1
n_decode += 1
n_cur += 1
if llama_decode(context, batch) != 0 {
print("failed to evaluate llama!")
}
return new_token_str
}
func clear() {
tokens_list.removeAll()
temporary_invalid_cchars.removeAll()
}
private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
let utf8Count = text.utf8.count
let n_tokens = utf8Count + (add_bos ? 1 : 0)
let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, false)
var swiftTokens: [llama_token] = []
for i in 0..<tokenCount {
swiftTokens.append(tokens[Int(i)])
}
tokens.deallocate()
return swiftTokens
}
/// - note: The result does not contain null-terminator
private func token_to_piece(token: llama_token) -> [CChar] {
let result = UnsafeMutablePointer<Int8>.allocate(capacity: 8)
result.initialize(repeating: Int8(0), count: 8)
defer {
result.deallocate()
}
let nTokens = llama_token_to_piece(model, token, result, 8)
if nTokens < 0 {
let newResult = UnsafeMutablePointer<Int8>.allocate(capacity: Int(-nTokens))
newResult.initialize(repeating: Int8(0), count: Int(-nTokens))
defer {
newResult.deallocate()
}
let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens)
let bufferPointer = UnsafeBufferPointer(start: newResult, count: Int(nNewTokens))
return Array(bufferPointer)
} else {
let bufferPointer = UnsafeBufferPointer(start: result, count: Int(nTokens))
return Array(bufferPointer)
}
}
}

View file

@ -0,0 +1,5 @@
//
// Use this file to import your target's public headers that you would like to expose to Swift.
//
#import "llama.h"

View file

@ -0,0 +1,481 @@
// !$*UTF8*$!
{
archiveVersion = 1;
classes = {
};
objectVersion = 56;
objects = {
/* Begin PBXBuildFile section */
542376082B0D9BFB008E6A1C /* ggml-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 542376072B0D9BFB008E6A1C /* ggml-quants.c */; };
5423760B2B0D9C4B008E6A1C /* ggml-backend.c in Sources */ = {isa = PBXBuildFile; fileRef = 5423760A2B0D9C4B008E6A1C /* ggml-backend.c */; };
542378792ACE3F3500834A7B /* ggml-metal.metal in Resources */ = {isa = PBXBuildFile; fileRef = 549479C82AC9E10B00E0F78B /* ggml-metal.metal */; };
542EA09D2AC8723900A8AEE9 /* ggml.c in Sources */ = {isa = PBXBuildFile; fileRef = 542EA09B2AC8723900A8AEE9 /* ggml.c */; settings = {COMPILER_FLAGS = "-DGGML_USE_ACCELERATE -DGGML_USE_METAL -DGGML_USE_K_QUANTS -O3"; }; };
542EA0A02AC8725700A8AEE9 /* ggml-alloc.c in Sources */ = {isa = PBXBuildFile; fileRef = 542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */; };
542EA0A32AC8729100A8AEE9 /* llama.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 542EA0A12AC8729100A8AEE9 /* llama.cpp */; settings = {COMPILER_FLAGS = "-DGGML_USE_K_QUANTS -DGGML_USE_METAL -O3"; }; };
549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 549479CA2AC9E16000E0F78B /* Metal.framework */; };
549479CD2AC9E42A00E0F78B /* ggml-metal.m in Sources */ = {isa = PBXBuildFile; fileRef = 549479C52AC9E0F200E0F78B /* ggml-metal.m */; settings = {COMPILER_FLAGS = "-fno-objc-arc -DGGML_SWIFT -DGGML_USE_METAL -O3"; }; };
8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */; };
8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83782AC328BD0096AF73 /* ContentView.swift */; };
8A1C837B2AC328BE0096AF73 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 8A1C837A2AC328BE0096AF73 /* Assets.xcassets */; };
8A1C837E2AC328BE0096AF73 /* Preview Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */; };
8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 8A39BE092AC7601000BFEB40 /* Accelerate.framework */; };
8A3F84242AC4C891005E2EE8 /* models in Resources */ = {isa = PBXBuildFile; fileRef = 8A3F84232AC4C891005E2EE8 /* models */; };
8A907F332AC7138A006146EA /* LibLlama.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A907F322AC7134E006146EA /* LibLlama.swift */; };
8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */; };
/* End PBXBuildFile section */
/* Begin PBXFileReference section */
542376062B0D9BEA008E6A1C /* ggml-quants.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-quants.h"; path = "../../ggml-quants.h"; sourceTree = "<group>"; };
542376072B0D9BFB008E6A1C /* ggml-quants.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-quants.c"; path = "../../ggml-quants.c"; sourceTree = "<group>"; };
542376092B0D9C40008E6A1C /* ggml-backend.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-backend.h"; path = "../../ggml-backend.h"; sourceTree = "<group>"; };
5423760A2B0D9C4B008E6A1C /* ggml-backend.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-backend.c"; path = "../../ggml-backend.c"; sourceTree = "<group>"; };
542EA09B2AC8723900A8AEE9 /* ggml.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = ggml.c; path = ../../ggml.c; sourceTree = "<group>"; };
542EA09C2AC8723900A8AEE9 /* ggml.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = ggml.h; path = ../../ggml.h; sourceTree = "<group>"; };
542EA09E2AC8725700A8AEE9 /* ggml-alloc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-alloc.h"; path = "../../ggml-alloc.h"; sourceTree = "<group>"; };
542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-alloc.c"; path = "../../ggml-alloc.c"; sourceTree = "<group>"; };
542EA0A12AC8729100A8AEE9 /* llama.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = llama.cpp; path = ../../llama.cpp; sourceTree = "<group>"; };
542EA0A22AC8729100A8AEE9 /* llama.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = llama.h; path = ../../llama.h; sourceTree = "<group>"; };
549479C52AC9E0F200E0F78B /* ggml-metal.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = "ggml-metal.m"; path = "../../ggml-metal.m"; sourceTree = "<group>"; };
549479C62AC9E0F200E0F78B /* ggml-metal.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-metal.h"; path = "../../ggml-metal.h"; sourceTree = "<group>"; };
549479C82AC9E10B00E0F78B /* ggml-metal.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; name = "ggml-metal.metal"; path = "../../ggml-metal.metal"; sourceTree = "<group>"; };
549479CA2AC9E16000E0F78B /* Metal.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Metal.framework; path = System/Library/Frameworks/Metal.framework; sourceTree = SDKROOT; };
8A08D20A2AC73B1500FE6CD4 /* bridging-header.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "bridging-header.h"; sourceTree = "<group>"; };
8A1C83732AC328BD0096AF73 /* llama.swiftui.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = llama.swiftui.app; sourceTree = BUILT_PRODUCTS_DIR; };
8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = llama_swiftuiApp.swift; sourceTree = "<group>"; };
8A1C83782AC328BD0096AF73 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = "<group>"; };
8A1C837A2AC328BE0096AF73 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; };
8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = "Preview Assets.xcassets"; sourceTree = "<group>"; };
8A39BE092AC7601000BFEB40 /* Accelerate.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Accelerate.framework; path = System/Library/Frameworks/Accelerate.framework; sourceTree = SDKROOT; };
8A3F841F2AC4C824005E2EE8 /* llama-2-7b-chat.Q2_K.gguf */ = {isa = PBXFileReference; lastKnownFileType = file; path = "llama-2-7b-chat.Q2_K.gguf"; sourceTree = "<group>"; };
8A3F84232AC4C891005E2EE8 /* models */ = {isa = PBXFileReference; lastKnownFileType = folder; name = models; path = llama.swiftui/Resources/models; sourceTree = "<group>"; };
8A907F322AC7134E006146EA /* LibLlama.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LibLlama.swift; sourceTree = "<group>"; };
8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LlamaState.swift; sourceTree = "<group>"; };
/* End PBXFileReference section */
/* Begin PBXFrameworksBuildPhase section */
8A1C83702AC328BD0096AF73 /* Frameworks */ = {
isa = PBXFrameworksBuildPhase;
buildActionMask = 2147483647;
files = (
549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */,
8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */,
);
runOnlyForDeploymentPostprocessing = 0;
};
/* End PBXFrameworksBuildPhase section */
/* Begin PBXGroup section */
8A08D1F62AC7383900FE6CD4 /* llama.cpp */ = {
isa = PBXGroup;
children = (
5423760A2B0D9C4B008E6A1C /* ggml-backend.c */,
542376092B0D9C40008E6A1C /* ggml-backend.h */,
542376062B0D9BEA008E6A1C /* ggml-quants.h */,
542376072B0D9BFB008E6A1C /* ggml-quants.c */,
549479C82AC9E10B00E0F78B /* ggml-metal.metal */,
549479C62AC9E0F200E0F78B /* ggml-metal.h */,
549479C52AC9E0F200E0F78B /* ggml-metal.m */,
542EA09B2AC8723900A8AEE9 /* ggml.c */,
542EA09C2AC8723900A8AEE9 /* ggml.h */,
542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */,
542EA09E2AC8725700A8AEE9 /* ggml-alloc.h */,
542EA0A12AC8729100A8AEE9 /* llama.cpp */,
542EA0A22AC8729100A8AEE9 /* llama.h */,
);
name = llama.cpp;
sourceTree = "<group>";
};
8A1C836A2AC328BD0096AF73 = {
isa = PBXGroup;
children = (
8A08D1F62AC7383900FE6CD4 /* llama.cpp */,
8A907F312AC7134E006146EA /* llama.cpp.swift */,
8A3F84232AC4C891005E2EE8 /* models */,
8A1C83752AC328BD0096AF73 /* llama.swiftui */,
8A1C83742AC328BD0096AF73 /* Products */,
8A39BE082AC7601000BFEB40 /* Frameworks */,
);
sourceTree = "<group>";
};
8A1C83742AC328BD0096AF73 /* Products */ = {
isa = PBXGroup;
children = (
8A1C83732AC328BD0096AF73 /* llama.swiftui.app */,
);
name = Products;
sourceTree = "<group>";
};
8A1C83752AC328BD0096AF73 /* llama.swiftui */ = {
isa = PBXGroup;
children = (
8A3F84102AC4BD85005E2EE8 /* Resources */,
8A9F7C4B2AC332DC008AE1EA /* Models */,
8A9F7C4A2AC332BF008AE1EA /* UI */,
8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */,
8A1C837A2AC328BE0096AF73 /* Assets.xcassets */,
8A1C837C2AC328BE0096AF73 /* Preview Content */,
);
path = llama.swiftui;
sourceTree = "<group>";
};
8A1C837C2AC328BE0096AF73 /* Preview Content */ = {
isa = PBXGroup;
children = (
8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */,
);
path = "Preview Content";
sourceTree = "<group>";
};
8A39BE082AC7601000BFEB40 /* Frameworks */ = {
isa = PBXGroup;
children = (
549479CA2AC9E16000E0F78B /* Metal.framework */,
8A39BE092AC7601000BFEB40 /* Accelerate.framework */,
);
name = Frameworks;
sourceTree = "<group>";
};
8A3F84102AC4BD85005E2EE8 /* Resources */ = {
isa = PBXGroup;
children = (
8A3F84112AC4BD8C005E2EE8 /* models */,
);
path = Resources;
sourceTree = "<group>";
};
8A3F84112AC4BD8C005E2EE8 /* models */ = {
isa = PBXGroup;
children = (
8A3F841F2AC4C824005E2EE8 /* llama-2-7b-chat.Q2_K.gguf */,
);
path = models;
sourceTree = "<group>";
};
8A907F312AC7134E006146EA /* llama.cpp.swift */ = {
isa = PBXGroup;
children = (
8A08D20A2AC73B1500FE6CD4 /* bridging-header.h */,
8A907F322AC7134E006146EA /* LibLlama.swift */,
);
path = llama.cpp.swift;
sourceTree = "<group>";
};
8A9F7C4A2AC332BF008AE1EA /* UI */ = {
isa = PBXGroup;
children = (
8A1C83782AC328BD0096AF73 /* ContentView.swift */,
);
path = UI;
sourceTree = "<group>";
};
8A9F7C4B2AC332DC008AE1EA /* Models */ = {
isa = PBXGroup;
children = (
8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */,
);
path = Models;
sourceTree = "<group>";
};
/* End PBXGroup section */
/* Begin PBXNativeTarget section */
8A1C83722AC328BD0096AF73 /* llama.swiftui */ = {
isa = PBXNativeTarget;
buildConfigurationList = 8A1C83812AC328BE0096AF73 /* Build configuration list for PBXNativeTarget "llama.swiftui" */;
buildPhases = (
8A1C836F2AC328BD0096AF73 /* Sources */,
8A1C83702AC328BD0096AF73 /* Frameworks */,
8A1C83712AC328BD0096AF73 /* Resources */,
);
buildRules = (
);
dependencies = (
);
name = llama.swiftui;
packageProductDependencies = (
);
productName = llama.swiftui;
productReference = 8A1C83732AC328BD0096AF73 /* llama.swiftui.app */;
productType = "com.apple.product-type.application";
};
/* End PBXNativeTarget section */
/* Begin PBXProject section */
8A1C836B2AC328BD0096AF73 /* Project object */ = {
isa = PBXProject;
attributes = {
BuildIndependentTargetsInParallel = 1;
LastSwiftUpdateCheck = 1500;
LastUpgradeCheck = 1500;
TargetAttributes = {
8A1C83722AC328BD0096AF73 = {
CreatedOnToolsVersion = 15.0;
LastSwiftMigration = 1500;
};
};
};
buildConfigurationList = 8A1C836E2AC328BD0096AF73 /* Build configuration list for PBXProject "llama.swiftui" */;
compatibilityVersion = "Xcode 14.0";
developmentRegion = en;
hasScannedForEncodings = 0;
knownRegions = (
en,
Base,
);
mainGroup = 8A1C836A2AC328BD0096AF73;
packageReferences = (
);
productRefGroup = 8A1C83742AC328BD0096AF73 /* Products */;
projectDirPath = "";
projectRoot = "";
targets = (
8A1C83722AC328BD0096AF73 /* llama.swiftui */,
);
};
/* End PBXProject section */
/* Begin PBXResourcesBuildPhase section */
8A1C83712AC328BD0096AF73 /* Resources */ = {
isa = PBXResourcesBuildPhase;
buildActionMask = 2147483647;
files = (
542378792ACE3F3500834A7B /* ggml-metal.metal in Resources */,
8A3F84242AC4C891005E2EE8 /* models in Resources */,
8A1C837E2AC328BE0096AF73 /* Preview Assets.xcassets in Resources */,
8A1C837B2AC328BE0096AF73 /* Assets.xcassets in Resources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
/* End PBXResourcesBuildPhase section */
/* Begin PBXSourcesBuildPhase section */
8A1C836F2AC328BD0096AF73 /* Sources */ = {
isa = PBXSourcesBuildPhase;
buildActionMask = 2147483647;
files = (
542376082B0D9BFB008E6A1C /* ggml-quants.c in Sources */,
549479CD2AC9E42A00E0F78B /* ggml-metal.m in Sources */,
542EA09D2AC8723900A8AEE9 /* ggml.c in Sources */,
8A907F332AC7138A006146EA /* LibLlama.swift in Sources */,
542EA0A32AC8729100A8AEE9 /* llama.cpp in Sources */,
8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */,
8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */,
8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */,
542EA0A02AC8725700A8AEE9 /* ggml-alloc.c in Sources */,
5423760B2B0D9C4B008E6A1C /* ggml-backend.c in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
/* End PBXSourcesBuildPhase section */
/* Begin XCBuildConfiguration section */
8A1C837F2AC328BE0096AF73 /* Debug */ = {
isa = XCBuildConfiguration;
buildSettings = {
ALWAYS_SEARCH_USER_PATHS = NO;
ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES;
CLANG_ANALYZER_NONNULL = YES;
CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
CLANG_CXX_LANGUAGE_STANDARD = "gnu++20";
CLANG_ENABLE_MODULES = YES;
CLANG_ENABLE_OBJC_ARC = YES;
CLANG_ENABLE_OBJC_WEAK = YES;
CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
CLANG_WARN_BOOL_CONVERSION = YES;
CLANG_WARN_COMMA = YES;
CLANG_WARN_CONSTANT_CONVERSION = YES;
CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
CLANG_WARN_EMPTY_BODY = YES;
CLANG_WARN_ENUM_CONVERSION = YES;
CLANG_WARN_INFINITE_RECURSION = YES;
CLANG_WARN_INT_CONVERSION = YES;
CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES;
CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
CLANG_WARN_STRICT_PROTOTYPES = YES;
CLANG_WARN_SUSPICIOUS_MOVE = YES;
CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
CLANG_WARN_UNREACHABLE_CODE = YES;
CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
COPY_PHASE_STRIP = NO;
DEBUG_INFORMATION_FORMAT = dwarf;
ENABLE_STRICT_OBJC_MSGSEND = YES;
ENABLE_TESTABILITY = YES;
ENABLE_USER_SCRIPT_SANDBOXING = YES;
GCC_C_LANGUAGE_STANDARD = gnu17;
GCC_DYNAMIC_NO_PIC = NO;
GCC_NO_COMMON_BLOCKS = YES;
GCC_OPTIMIZATION_LEVEL = 0;
GCC_PREPROCESSOR_DEFINITIONS = (
"DEBUG=1",
"$(inherited)",
);
GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
GCC_WARN_UNDECLARED_SELECTOR = YES;
GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
GCC_WARN_UNUSED_FUNCTION = YES;
GCC_WARN_UNUSED_VARIABLE = YES;
IPHONEOS_DEPLOYMENT_TARGET = 17.0;
LOCALIZATION_PREFERS_STRING_CATALOGS = YES;
MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE;
MTL_FAST_MATH = YES;
ONLY_ACTIVE_ARCH = YES;
SDKROOT = iphoneos;
SWIFT_ACTIVE_COMPILATION_CONDITIONS = "DEBUG $(inherited)";
SWIFT_OPTIMIZATION_LEVEL = "-Onone";
};
name = Debug;
};
8A1C83802AC328BE0096AF73 /* Release */ = {
isa = XCBuildConfiguration;
buildSettings = {
ALWAYS_SEARCH_USER_PATHS = NO;
ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES;
CLANG_ANALYZER_NONNULL = YES;
CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
CLANG_CXX_LANGUAGE_STANDARD = "gnu++20";
CLANG_ENABLE_MODULES = YES;
CLANG_ENABLE_OBJC_ARC = YES;
CLANG_ENABLE_OBJC_WEAK = YES;
CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
CLANG_WARN_BOOL_CONVERSION = YES;
CLANG_WARN_COMMA = YES;
CLANG_WARN_CONSTANT_CONVERSION = YES;
CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
CLANG_WARN_EMPTY_BODY = YES;
CLANG_WARN_ENUM_CONVERSION = YES;
CLANG_WARN_INFINITE_RECURSION = YES;
CLANG_WARN_INT_CONVERSION = YES;
CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES;
CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
CLANG_WARN_STRICT_PROTOTYPES = YES;
CLANG_WARN_SUSPICIOUS_MOVE = YES;
CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
CLANG_WARN_UNREACHABLE_CODE = YES;
CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
COPY_PHASE_STRIP = NO;
DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
ENABLE_NS_ASSERTIONS = NO;
ENABLE_STRICT_OBJC_MSGSEND = YES;
ENABLE_USER_SCRIPT_SANDBOXING = YES;
GCC_C_LANGUAGE_STANDARD = gnu17;
GCC_NO_COMMON_BLOCKS = YES;
GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
GCC_WARN_UNDECLARED_SELECTOR = YES;
GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
GCC_WARN_UNUSED_FUNCTION = YES;
GCC_WARN_UNUSED_VARIABLE = YES;
IPHONEOS_DEPLOYMENT_TARGET = 17.0;
LOCALIZATION_PREFERS_STRING_CATALOGS = YES;
MTL_ENABLE_DEBUG_INFO = NO;
MTL_FAST_MATH = YES;
SDKROOT = iphoneos;
SWIFT_COMPILATION_MODE = wholemodule;
VALIDATE_PRODUCT = YES;
};
name = Release;
};
8A1C83822AC328BE0096AF73 /* Debug */ = {
isa = XCBuildConfiguration;
buildSettings = {
ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor;
CLANG_ENABLE_MODULES = YES;
CODE_SIGN_STYLE = Automatic;
CURRENT_PROJECT_VERSION = 1;
DEVELOPMENT_ASSET_PATHS = "\"llama.swiftui/Preview Content\"";
DEVELOPMENT_TEAM = STLSG3FG8Q;
ENABLE_PREVIEWS = YES;
GENERATE_INFOPLIST_FILE = YES;
INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES;
INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES;
INFOPLIST_KEY_UILaunchScreen_Generation = YES;
INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
IPHONEOS_DEPLOYMENT_TARGET = 16.0;
LD_RUNPATH_SEARCH_PATHS = (
"$(inherited)",
"@executable_path/Frameworks",
);
MARKETING_VERSION = 1.0;
PRODUCT_BUNDLE_IDENTIFIER = "com.bachittle.llama-swift";
PRODUCT_NAME = "$(TARGET_NAME)";
SWIFT_EMIT_LOC_STRINGS = YES;
SWIFT_OBJC_BRIDGING_HEADER = "llama.cpp.swift/bridging-header.h";
SWIFT_OPTIMIZATION_LEVEL = "-Onone";
SWIFT_VERSION = 5.0;
TARGETED_DEVICE_FAMILY = "1,2";
};
name = Debug;
};
8A1C83832AC328BE0096AF73 /* Release */ = {
isa = XCBuildConfiguration;
buildSettings = {
ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor;
CLANG_ENABLE_MODULES = YES;
CODE_SIGN_STYLE = Automatic;
CURRENT_PROJECT_VERSION = 1;
DEVELOPMENT_ASSET_PATHS = "\"llama.swiftui/Preview Content\"";
DEVELOPMENT_TEAM = STLSG3FG8Q;
ENABLE_PREVIEWS = YES;
GENERATE_INFOPLIST_FILE = YES;
INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES;
INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES;
INFOPLIST_KEY_UILaunchScreen_Generation = YES;
INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
IPHONEOS_DEPLOYMENT_TARGET = 16.0;
LD_RUNPATH_SEARCH_PATHS = (
"$(inherited)",
"@executable_path/Frameworks",
);
MARKETING_VERSION = 1.0;
PRODUCT_BUNDLE_IDENTIFIER = "com.bachittle.llama-swift";
PRODUCT_NAME = "$(TARGET_NAME)";
SWIFT_EMIT_LOC_STRINGS = YES;
SWIFT_OBJC_BRIDGING_HEADER = "llama.cpp.swift/bridging-header.h";
SWIFT_VERSION = 5.0;
TARGETED_DEVICE_FAMILY = "1,2";
};
name = Release;
};
/* End XCBuildConfiguration section */
/* Begin XCConfigurationList section */
8A1C836E2AC328BD0096AF73 /* Build configuration list for PBXProject "llama.swiftui" */ = {
isa = XCConfigurationList;
buildConfigurations = (
8A1C837F2AC328BE0096AF73 /* Debug */,
8A1C83802AC328BE0096AF73 /* Release */,
);
defaultConfigurationIsVisible = 0;
defaultConfigurationName = Release;
};
8A1C83812AC328BE0096AF73 /* Build configuration list for PBXNativeTarget "llama.swiftui" */ = {
isa = XCConfigurationList;
buildConfigurations = (
8A1C83822AC328BE0096AF73 /* Debug */,
8A1C83832AC328BE0096AF73 /* Release */,
);
defaultConfigurationIsVisible = 0;
defaultConfigurationName = Release;
};
/* End XCConfigurationList section */
};
rootObject = 8A1C836B2AC328BD0096AF73 /* Project object */;
}

View file

@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<Workspace
version = "1.0">
<FileRef
location = "self:">
</FileRef>
</Workspace>

View file

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>IDEDidComputeMac32BitWarning</key>
<true/>
</dict>
</plist>

View file

@ -0,0 +1,11 @@
{
"colors" : [
{
"idiom" : "universal"
}
],
"info" : {
"author" : "xcode",
"version" : 1
}
}

View file

@ -0,0 +1,13 @@
{
"images" : [
{
"idiom" : "universal",
"platform" : "ios",
"size" : "1024x1024"
}
],
"info" : {
"author" : "xcode",
"version" : 1
}
}

View file

@ -0,0 +1,6 @@
{
"info" : {
"author" : "xcode",
"version" : 1
}
}

View file

@ -0,0 +1,45 @@
import Foundation
@MainActor
class LlamaState: ObservableObject {
@Published var messageLog = ""
private var llamaContext: LlamaContext?
private var modelUrl: URL? {
Bundle.main.url(forResource: "q8_0", withExtension: "gguf", subdirectory: "models")
// Bundle.main.url(forResource: "llama-2-7b-chat", withExtension: "Q2_K.gguf", subdirectory: "models")
}
init() {
do {
try loadModel()
} catch {
messageLog += "Error!\n"
}
}
private func loadModel() throws {
messageLog += "Loading model...\n"
if let modelUrl {
llamaContext = try LlamaContext.createContext(path: modelUrl.path())
messageLog += "Loaded model \(modelUrl.lastPathComponent)\n"
} else {
messageLog += "Could not locate model\n"
}
}
func complete(text: String) async {
guard let llamaContext else {
return
}
messageLog += "Attempting to complete text...\n"
await llamaContext.completion_init(text: text)
messageLog += "\(text)"
while await llamaContext.n_cur <= llamaContext.n_len {
let result = await llamaContext.completion_loop()
messageLog += "\(result)"
}
await llamaContext.clear()
messageLog += "\n\ndone\n"
}
}

View file

@ -0,0 +1,6 @@
{
"info" : {
"author" : "xcode",
"version" : 1
}
}

View file

@ -0,0 +1,42 @@
import SwiftUI
struct ContentView: View {
@StateObject var llamaState = LlamaState()
@State private var multiLineText = ""
var body: some View {
VStack {
ScrollView(.vertical) {
Text(llamaState.messageLog)
}
TextEditor(text: $multiLineText)
.frame(height: 200)
.padding()
.border(Color.gray, width: 0.5)
Button(action: {
sendText()
}) {
Text("Send")
.padding()
.background(Color.blue)
.foregroundColor(.white)
.cornerRadius(8)
}
}
.padding()
}
func sendText() {
Task {
await llamaState.complete(text: multiLineText)
multiLineText = ""
}
}
}
/*
#Preview {
ContentView()
}
*/

View file

@ -0,0 +1,10 @@
import SwiftUI
@main
struct llama_swiftuiApp: App {
var body: some Scene {
WindowGroup {
ContentView()
}
}
}

View file

@ -0,0 +1,36 @@
add_library(llava OBJECT
llava.cpp
llava.h
clip.cpp
clip.h
)
target_link_libraries(llava PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
target_include_directories(llava PUBLIC .)
target_include_directories(llava PUBLIC ../..)
target_include_directories(llava PUBLIC ../../common)
target_compile_features(llava PRIVATE cxx_std_11)
add_library(llava_static STATIC $<TARGET_OBJECTS:llava>)
if (BUILD_SHARED_LIBS)
set_target_properties(llava PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(llava PRIVATE LLAMA_SHARED LLAMA_BUILD)
add_library(llava_shared SHARED $<TARGET_OBJECTS:llava>)
target_link_libraries(llava_shared PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
install(TARGETS llava_shared LIBRARY)
endif()
if (NOT MSVC)
target_compile_options(llava PRIVATE -Wno-cast-qual) # stb_image.h
endif()
if(TARGET BUILD_INFO)
add_dependencies(llava BUILD_INFO)
endif()
set(TARGET llava-cli)
add_executable(llava-cli llava-cli.cpp)
install(TARGETS llava-cli RUNTIME)
target_link_libraries(llava-cli PRIVATE common llama llava ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(llava PRIVATE cxx_std_11)

Some files were not shown because too many files have changed in this diff Show more