Merge branch 'master' into finetune-lora

# Conflicts:
#	Makefile
#	examples/baby-llama/baby-llama.cpp
#	examples/train-text-from-scratch/train-text-from-scratch.cpp
#	llama.cpp
commit d3e06d3e73 — xaedes, 2023-09-16 20:31:58 +02:00
69 changed files with 4175 additions and 1655 deletions

.devops/cloud-v-pipeline (new file)

@@ -0,0 +1,22 @@
node('x86_runner1'){ // Running on x86 runner containing latest vector qemu, latest vector gcc and all the necessary libraries
stage('Cleanup'){
cleanWs() // Cleaning previous CI build in workspace
}
stage('checkout repo'){
retry(5){ // Retry if the cloning fails due to some reason
checkout scm // Clone the repo on Runner
}
}
stage('Compiling llama.cpp'){
sh'''#!/bin/bash
make RISCV=1 RISCV_CROSS_COMPILE=1 # Compiling llama for RISC-V
'''
}
stage('Running llama.cpp'){
sh'''#!/bin/bash
module load gnu-bin2/0.1 # loading latest versions of vector qemu and vector gcc
qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./main -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt # Running llama.cpp on vector qemu-riscv64
cat llama_log.txt # Printing results
'''
}
}

.github/workflows/build.yml

@@ -27,7 +27,7 @@ jobs:
     steps:
       - name: Clone
         id: checkout
-        uses: actions/checkout@v1
+        uses: actions/checkout@v3

       - name: Dependencies
         id: depends

@@ -52,7 +52,7 @@ jobs:
     steps:
       - name: Clone
         id: checkout
-        uses: actions/checkout@v1
+        uses: actions/checkout@v3

       - name: Dependencies
         id: depends

@@ -87,7 +87,7 @@ jobs:
     steps:
       - name: Clone
         id: checkout
-        uses: actions/checkout@v1
+        uses: actions/checkout@v3

       - name: Dependencies
         id: depends

@@ -121,7 +121,7 @@ jobs:
     steps:
       - name: Clone
         id: checkout
-        uses: actions/checkout@v1
+        uses: actions/checkout@v3

       - name: Dependencies
         id: depends

@@ -149,7 +149,7 @@ jobs:
     steps:
       - name: Clone
         id: checkout
-        uses: actions/checkout@v1
+        uses: actions/checkout@v3

       - name: Dependencies
         id: depends

@@ -174,7 +174,7 @@ jobs:
     steps:
       - name: Clone
         id: checkout
-        uses: actions/checkout@v1
+        uses: actions/checkout@v3

       - name: Dependencies
         id: depends
@@ -197,6 +197,62 @@ jobs:
           cd build
           ctest --verbose --timeout 900

+  macOS-latest-cmake-ios:
+    runs-on: macos-latest
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v1
+
+      - name: Dependencies
+        id: depends
+        continue-on-error: true
+        run: |
+          brew update
+
+      - name: Build
+        id: cmake_build
+        run: |
+          sysctl -a
+          mkdir build
+          cd build
+          cmake -G Xcode .. \
+            -DLLAMA_BUILD_EXAMPLES=OFF \
+            -DLLAMA_BUILD_TESTS=OFF \
+            -DLLAMA_BUILD_SERVER=OFF \
+            -DCMAKE_SYSTEM_NAME=iOS \
+            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0
+          cmake --build . --config Release
+
+  macOS-latest-cmake-tvos:
+    runs-on: macos-latest
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v1
+
+      - name: Dependencies
+        id: depends
+        continue-on-error: true
+        run: |
+          brew update
+
+      - name: Build
+        id: cmake_build
+        run: |
+          sysctl -a
+          mkdir build
+          cd build
+          cmake -G Xcode .. \
+            -DLLAMA_BUILD_EXAMPLES=OFF \
+            -DLLAMA_BUILD_TESTS=OFF \
+            -DLLAMA_BUILD_SERVER=OFF \
+            -DCMAKE_SYSTEM_NAME=tvOS \
+            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0
+          cmake --build . --config Release
+
   windows-latest-cmake:
     runs-on: windows-latest
@@ -209,22 +265,24 @@ jobs:
     strategy:
       matrix:
         include:
           - build: 'noavx'
-            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF'
+            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON'
           - build: 'avx2'
-            defines: '-DLLAMA_BUILD_SERVER=ON'
+            defines: '-DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
           - build: 'avx'
-            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF'
+            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
           - build: 'avx512'
             defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
           - build: 'clblast'
-            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
+            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
           - build: 'openblas'
-            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
+            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'

     steps:
       - name: Clone
         id: checkout
-        uses: actions/checkout@v1
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0

       - name: Download OpenCL SDK
         id: get_opencl
@@ -334,27 +392,28 @@ jobs:
     strategy:
       matrix:
-        cuda: ['12.1.0', '11.7.1']
+        cuda: ['12.2.0', '11.7.1']
         build: ['cublas']

     steps:
       - name: Clone
         id: checkout
-        uses: actions/checkout@v1
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0

-      - uses: Jimver/cuda-toolkit@v0.2.10
+      - uses: Jimver/cuda-toolkit@v0.2.11
        id: cuda-toolkit
        with:
          cuda: ${{ matrix.cuda }}
-         # TODO(green-sky): _dev seems to fail, and non dev are not enought
-         #sub-packages: '["nvcc", "cudart", "cublas", "cudart_dev", "cublas_dev"]'
+         sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'

       - name: Build
         id: cmake_build
         run: |
           mkdir build
           cd build
-          cmake .. -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON
+          cmake .. -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON -DBUILD_SHARED_LIBS=ON
           cmake --build . --config Release

       - name: Determine tag name
@@ -384,27 +443,11 @@ jobs:
             llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip

       - name: Copy and pack Cuda runtime
-        if: ${{ matrix.cuda == '12.1.0' }}
-        # TODO(green-sky): paths are cuda 12 specific
         run: |
           echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
-          mkdir '.\build\bin\cudart\'
-          cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cudart64_12.dll" '.\build\bin\cudart\'
-          cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublas64_12.dll" '.\build\bin\cudart\'
-          cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublasLt64_12.dll" '.\build\bin\cudart\'
-          7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip .\build\bin\cudart\*
-
-      - name: Copy and pack Cuda runtime
-        if: ${{ matrix.cuda == '11.7.1' }}
-        # TODO(green-sky): paths are cuda 11 specific
-        run: |
-          echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
-          mkdir '.\build\bin\cudart\'
-          ls "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin"
-          cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cudart64_110.dll" '.\build\bin\cudart\'
-          cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublas64_11.dll" '.\build\bin\cudart\'
-          cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublasLt64_11.dll" '.\build\bin\cudart\'
-          7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip .\build\bin\cudart\*
+          $dst='.\build\bin\cudart\'
+          robocopy "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
+          7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip $dst\*

       - name: Upload Cuda runtime
         if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}

@@ -413,6 +456,22 @@ jobs:
           path: |
             cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip

+  freeBSD-latest:
+    runs-on: macos-12
+    steps:
+      - name: Clone
+        uses: actions/checkout@v3
+
+      - name: Build
+        uses: cross-platform-actions/action@v0.19.0
+        with:
+          operating_system: freebsd
+          version: '13.2'
+          run: |
+            sudo pkg update
+            sudo pkg install -y gmake automake autoconf pkgconf llvm15 clinfo clover opencl clblast openblas
+            gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15
+
   release:
     if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
@@ -429,7 +488,9 @@ jobs:
     steps:
       - name: Clone
         id: checkout
-        uses: actions/checkout@v1
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0

       - name: Determine tag name
         id: tag
@@ -487,7 +548,7 @@ jobs:
 #
 #    steps:
 #      - name: Clone
-#        uses: actions/checkout@v1
+#        uses: actions/checkout@v3
 #
 #      - name: Dependencies
 #        run: |

@@ -511,7 +572,7 @@ jobs:
 #
 #    steps:
 #      - name: Clone
-#        uses: actions/checkout@v1
+#        uses: actions/checkout@v3
 #
 #      - name: Dependencies
 #        run: |

@@ -535,7 +596,7 @@ jobs:
 #
 #    steps:
 #      - name: Clone
-#        uses: actions/checkout@v1
+#        uses: actions/checkout@v3
 #
 #      - name: Dependencies
 #        run: |

@@ -565,7 +626,7 @@ jobs:
 #
 #    steps:
 #      - name: Clone
-#        uses: actions/checkout@v1
+#        uses: actions/checkout@v3
 #
 #      - name: Add msbuild to PATH
 #        uses: microsoft/setup-msbuild@v1

@@ -604,7 +665,7 @@ jobs:
 #
 #    steps:
 #      - name: Clone
-#        uses: actions/checkout@v1
+#        uses: actions/checkout@v3
 #
 #      - name: Add msbuild to PATH
 #        uses: microsoft/setup-msbuild@v1

@@ -650,7 +711,7 @@ jobs:
 #
 #    steps:
 #      - name: Clone
-#        uses: actions/checkout@v1
+#        uses: actions/checkout@v3
 #
 #      - name: Dependencies
 #        run: |

.github/workflows/docker.yml

@@ -26,8 +26,15 @@ jobs:
     strategy:
       matrix:
         config:
-          - { tag: "light", dockerfile: ".devops/main.Dockerfile" }
-          - { tag: "full", dockerfile: ".devops/full.Dockerfile" }
+          - { tag: "light", dockerfile: ".devops/main.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          # NOTE(canardletter): The CUDA builds on arm64 are very slow, so I
+          #                     have disabled them for now until the reason why
+          #                     is understood.
+          - { tag: "light-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platforms: "linux/amd64" }
+          - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
+          - { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          - { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }

     steps:
       - name: Check out the repo
         uses: actions/checkout@v3

@@ -51,7 +58,7 @@ jobs:
         with:
           context: .
           push: true
-          platforms: linux/amd64,linux/arm64
+          platforms: ${{ matrix.config.platforms }}
           tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
           file: ${{ matrix.config.dockerfile }}

@@ -60,6 +67,6 @@ jobs:
         with:
           context: .
           push: ${{ github.event_name == 'push' }}
-          platforms: linux/amd64,linux/arm64
+          platforms: ${{ matrix.config.platforms }}
           tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}"
           file: ${{ matrix.config.dockerfile }}


@@ -24,7 +24,7 @@ jobs:
     runs-on: ubuntu-latest

     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
       - name: Set up Python
         uses: actions/setup-python@v2
         with:

CMakeLists.txt

@@ -135,6 +135,7 @@ set(CMAKE_C_STANDARD 11)
 set(CMAKE_C_STANDARD_REQUIRED true)
 set(THREADS_PREFER_PTHREAD_FLAG ON)
 find_package(Threads REQUIRED)
+include(CheckCXXCompilerFlag)

 if (NOT MSVC)
     if (LLAMA_SANITIZE_THREAD)
@@ -171,8 +172,8 @@ if (LLAMA_METAL)
     find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
     message(STATUS "Metal framework found")

-    set(GGML_SOURCES_METAL ggml-metal.m ggml-metal.h)
+    set(GGML_HEADERS_METAL ggml-metal.h)
+    set(GGML_SOURCES_METAL ggml-metal.m)

     add_compile_definitions(GGML_USE_METAL)
     if (LLAMA_METAL_NDEBUG)

@@ -191,7 +192,6 @@ if (LLAMA_METAL)
         ${METALKIT_FRAMEWORK}
     )
 endif()
-
 if (LLAMA_BLAS)
     if (LLAMA_STATIC)
         set(BLA_STATIC ON)
@@ -268,7 +268,8 @@ if (LLAMA_BLAS)
 endif()

 if (LLAMA_K_QUANTS)
-    set(GGML_SOURCES_EXTRA ${GGML_SOURCES_EXTRA} k_quants.c k_quants.h)
+    set(GGML_HEADERS_EXTRA k_quants.h)
+    set(GGML_SOURCES_EXTRA k_quants.c)
    add_compile_definitions(GGML_USE_K_QUANTS)
    if (LLAMA_QKK_64)
        add_compile_definitions(GGML_QKK_64)
@@ -284,7 +285,8 @@ if (LLAMA_CUBLAS)
     enable_language(CUDA)

-    set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h)
+    set(GGML_HEADERS_CUDA ggml-cuda.h)
+    set(GGML_SOURCES_CUDA ggml-cuda.cu)

     add_compile_definitions(GGML_USE_CUBLAS)
 #    if (LLAMA_CUDA_CUBLAS)
@@ -332,6 +334,7 @@ if (LLAMA_MPI)
     find_package(MPI)
     if (MPI_C_FOUND)
         message(STATUS "MPI found")
+        set(GGML_HEADERS_MPI ggml-mpi.h)
         set(GGML_SOURCES_MPI ggml-mpi.c ggml-mpi.h)
         add_compile_definitions(GGML_USE_MPI)
         add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS})
@@ -354,7 +357,8 @@ if (LLAMA_CLBLAST)
     if (CLBlast_FOUND)
         message(STATUS "CLBlast found")

-        set(GGML_SOURCES_OPENCL ggml-opencl.cpp ggml-opencl.h)
+        set(GGML_HEADERS_OPENCL ggml-opencl.h)
+        set(GGML_SOURCES_OPENCL ggml-opencl.cpp)

         add_compile_definitions(GGML_USE_CLBLAST)
@@ -382,13 +386,15 @@ if (LLAMA_HIPBLAS)
         message(STATUS "HIP and hipBLAS found")
         add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
         add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h)
+        if (BUILD_SHARED_LIBS)
+            set_target_properties(ggml-rocm PROPERTIES POSITION_INDEPENDENT_CODE ON)
+        endif()
         if (LLAMA_CUDA_FORCE_DMMV)
             target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV)
         endif()
         target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
         target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
         target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
-        target_compile_definitions(ggml-rocm PRIVATE CC_TURING=1000000000)
         set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX)
         target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::rocblas roc::hipblas)
@@ -421,6 +427,7 @@ if (LLAMA_ALL_WARNINGS)
             -Wextra
             -Wpedantic
             -Wcast-qual
+            -Wmissing-declarations
             -Wno-unused-function
             -Wno-multichar
         )
@@ -439,7 +446,7 @@ if (LLAMA_ALL_WARNINGS)
 endif()

-if (MSVC)
+if (WIN32)
     add_compile_definitions(_CRT_SECURE_NO_WARNINGS)

     if (BUILD_SHARED_LIBS)
@@ -461,6 +468,13 @@ endif()
 # TODO: probably these flags need to be tweaked on some architectures
 #       feel free to update the Makefile for your architecture and send a pull request or issue
 message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
+if (MSVC)
+    string(TOLOWER "${CMAKE_GENERATOR_PLATFORM}" CMAKE_GENERATOR_PLATFORM_LWR)
+    message(STATUS "CMAKE_GENERATOR_PLATFORM: ${CMAKE_GENERATOR_PLATFORM}")
+else ()
+    set(CMAKE_GENERATOR_PLATFORM_LWR "")
+endif ()
+
 if (NOT MSVC)
     if (LLAMA_STATIC)
         add_link_options(-static)
@@ -476,25 +490,33 @@ if (NOT MSVC)
     endif()
 endif()

-if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
+if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") OR ("${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "arm64"))
     message(STATUS "ARM detected")
     if (MSVC)
-        # TODO: arm msvc?
+        add_compile_definitions(__ARM_NEON)
+        add_compile_definitions(__ARM_FEATURE_FMA)
+        add_compile_definitions(__ARM_FEATURE_DOTPROD)
+        # add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) # MSVC doesn't support vdupq_n_f16, vld1q_f16, vst1q_f16
+        add_compile_definitions(__aarch64__) # MSVC defines _M_ARM64 instead
     else()
+        check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
+        if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
+            add_compile_options(-mfp16-format=ieee)
+        endif()
         if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
             # Raspberry Pi 1, Zero
-            add_compile_options(-mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access)
+            add_compile_options(-mfpu=neon-fp-armv8 -mno-unaligned-access)
         endif()
         if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
             # Raspberry Pi 2
-            add_compile_options(-mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations)
+            add_compile_options(-mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
         endif()
         if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
             # Raspberry Pi 3, 4, Zero 2 (32-bit)
-            add_compile_options(-mfp16-format=ieee -mno-unaligned-access)
+            add_compile_options(-mno-unaligned-access)
         endif()
     endif()
-elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$")
+elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "^(x86_64|i686|amd64|x64)$" )
     message(STATUS "x86 detected")
     if (MSVC)
         if (LLAMA_AVX512)
@@ -578,10 +600,12 @@ endif()
 # RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
 # and on macOS its availability depends on enabling Darwin extensions
 # similarly on DragonFly, enabling BSD extensions is necessary
-if (CMAKE_SYSTEM_NAME MATCHES "Darwin")
-    add_compile_definitions(_DARWIN_C_SOURCE)
-endif()
-if (CMAKE_SYSTEM_NAME MATCHES "DragonFly")
+if (
+    CMAKE_SYSTEM_NAME MATCHES "Darwin" OR
+    CMAKE_SYSTEM_NAME MATCHES "iOS" OR
+    CMAKE_SYSTEM_NAME MATCHES "tvOS" OR
+    CMAKE_SYSTEM_NAME MATCHES "DragonFly"
+)
     add_compile_definitions(_DARWIN_C_SOURCE)
 endif()
@@ -614,11 +638,11 @@ add_library(ggml OBJECT
             ggml.h
             ggml-alloc.c
             ggml-alloc.h
-            ${GGML_SOURCES_CUDA}
-            ${GGML_SOURCES_OPENCL}
-            ${GGML_SOURCES_METAL}
-            ${GGML_SOURCES_MPI}
-            ${GGML_SOURCES_EXTRA}
+            ${GGML_SOURCES_CUDA}   ${GGML_HEADERS_CUDA}
+            ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
+            ${GGML_SOURCES_METAL}  ${GGML_HEADERS_METAL}
+            ${GGML_SOURCES_MPI}    ${GGML_HEADERS_MPI}
+            ${GGML_SOURCES_EXTRA}  ${GGML_HEADERS_EXTRA}
             )

 target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
@@ -656,14 +680,53 @@ if (BUILD_SHARED_LIBS)
     if (LLAMA_METAL)
         set_target_properties(llama PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")
     endif()
-    install(TARGETS llama LIBRARY)
 endif()

 #
 # install
 #

 include(GNUInstallDirs)
+include(CMakePackageConfigHelpers)
+
+set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR}
+    CACHE PATH "Location of header files")
+set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR}
+    CACHE PATH "Location of library files")
+set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR}
+    CACHE PATH "Location of binary files")
+set(LLAMA_BUILD_NUMBER ${BUILD_NUMBER})
+set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT})
+set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER})
+
+configure_package_config_file(
+        ${CMAKE_CURRENT_SOURCE_DIR}/scripts/LlamaConfig.cmake.in
+        ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake
+    INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Llama
+    PATH_VARS LLAMA_INCLUDE_INSTALL_DIR
+              LLAMA_LIB_INSTALL_DIR
+              LLAMA_BIN_INSTALL_DIR )
+
+write_basic_package_version_file(
+    ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfigVersion.cmake
+    VERSION ${LLAMA_INSTALL_VERSION}
+    COMPATIBILITY SameMajorVersion)
+
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake
+              ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfigVersion.cmake
+        DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Llama)
+
+set(GGML_PUBLIC_HEADERS "ggml.h"
+        "${GGML_HEADERS_CUDA}" "${GGML_HEADERS_OPENCL}"
+        "${GGML_HEADERS_METAL}" "${GGML_HEADERS_MPI}" "${GGML_HEADERS_EXTRA}")
+set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
+install(TARGETS ggml PUBLIC_HEADER)
+
+set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/llama.h)
+install(TARGETS llama LIBRARY PUBLIC_HEADER)
+
 install(
     FILES convert.py
     PERMISSIONS

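With the package files above installed via `cmake --install`, a downstream project should be able to call `find_package(Llama)` and link against the `llama` target. A minimal consumer sketch, assuming the llama.h C API as of this commit (llama_load_model_from_file still takes llama_context_params here; the file name and model path are hypothetical):

    // consumer.cpp -- minimal sketch against the installed package
    #include <cstdio>
    #include "llama.h"

    int main(int argc, char ** argv) {
        if (argc < 2) {
            fprintf(stderr, "usage: %s <model.gguf>\n", argv[0]);
            return 1;
        }
        llama_backend_init(false /* numa */);

        llama_context_params params = llama_context_default_params();
        llama_model * model = llama_load_model_from_file(argv[1], params);
        if (model == NULL) {
            fprintf(stderr, "failed to load %s\n", argv[1]);
            return 1;
        }
        printf("%s\n", llama_print_system_info());

        llama_free_model(model);
        llama_backend_free();
        return 0;
    }

Linking is then a matter of `target_link_libraries(consumer PRIVATE llama)` in the consuming project.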
Makefile

@@ -2,7 +2,7 @@
 BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative finetune tests/test-c.o

 # Binaries only useful for tests
-TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1
+TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama

 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -49,7 +49,7 @@ test: $(TEST_TARGETS)
 			./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \
 		elif [ "$$test_target" = "tests/test-tokenizer-0-falcon" ]; then \
 			continue; \
-		elif [ "$$test_target" = "tests/test-tokenizer-1" ]; then \
+		elif [ "$$test_target" = "tests/test-tokenizer-1-llama" ]; then \
 			continue; \
 		else \
 			echo "Running test $$test_target..."; \
@@ -95,65 +95,60 @@ CXXV := $(shell $(CXX) --version | head -n 1)
 #
 # keep standard at C11 and C++11
+MK_CPPFLAGS = -I. -Icommon
+MK_CFLAGS   = -std=c11   -fPIC
+MK_CXXFLAGS = -std=c++11 -fPIC

 # -Ofast tends to produce faster code, but may not be available for some compilers.
 ifdef LLAMA_FAST
-OPT = -Ofast
+MK_CFLAGS        += -Ofast
+MK_HOST_CXXFLAGS += -Ofast
+MK_CUDA_CXXFLAGS += -O3
 else
-OPT = -O3
+MK_CFLAGS   += -O3
+MK_CXXFLAGS += -O3
 endif

-MK_CPPFLAGS = -I. -Icommon
-MK_CFLAGS   = $(OPT) -std=c11   -fPIC
-MK_CXXFLAGS = $(OPT) -std=c++11 -fPIC
-MK_LDFLAGS  =
-
 # clock_gettime came in POSIX.1b (1993)
 # CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
 # posix_memalign came in POSIX.1-2001 / SUSv3
 # M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985)
-MK_CFLAGS   += -D_XOPEN_SOURCE=600
-MK_CXXFLAGS += -D_XOPEN_SOURCE=600
+MK_CPPFLAGS += -D_XOPEN_SOURCE=600

 # Somehow in OpenBSD whenever POSIX conformance is specified
 # some string functions rely on locale_t availability,
 # which was introduced in POSIX.1-2008, forcing us to go higher
 ifeq ($(UNAME_S),OpenBSD)
-MK_CFLAGS   += -U_XOPEN_SOURCE -D_XOPEN_SOURCE=700
-MK_CXXFLAGS += -U_XOPEN_SOURCE -D_XOPEN_SOURCE=700
+MK_CPPFLAGS += -U_XOPEN_SOURCE -D_XOPEN_SOURCE=700
 endif

 # Data types, macros and functions related to controlling CPU affinity and
 # some memory allocation are available on Linux through GNU extensions in libc
 ifeq ($(UNAME_S),Linux)
-MK_CFLAGS   += -D_GNU_SOURCE
-MK_CXXFLAGS += -D_GNU_SOURCE
+MK_CPPFLAGS += -D_GNU_SOURCE
 endif

 # RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
 # and on macOS its availability depends on enabling Darwin extensions
 # similarly on DragonFly, enabling BSD extensions is necessary
 ifeq ($(UNAME_S),Darwin)
-MK_CFLAGS   += -D_DARWIN_C_SOURCE
-MK_CXXFLAGS += -D_DARWIN_C_SOURCE
+MK_CPPFLAGS += -D_DARWIN_C_SOURCE
 endif
 ifeq ($(UNAME_S),DragonFly)
-MK_CFLAGS   += -D__BSD_VISIBLE
-MK_CXXFLAGS += -D__BSD_VISIBLE
+MK_CPPFLAGS += -D__BSD_VISIBLE
 endif

 # alloca is a non-standard interface that is not visible on BSDs when
 # POSIX conformance is specified, but not all of them provide a clean way
 # to enable it in such cases
 ifeq ($(UNAME_S),FreeBSD)
-MK_CFLAGS   += -D__BSD_VISIBLE
-MK_CXXFLAGS += -D__BSD_VISIBLE
+MK_CPPFLAGS += -D__BSD_VISIBLE
 endif
 ifeq ($(UNAME_S),NetBSD)
-MK_CFLAGS   += -D_NETBSD_SOURCE
-MK_CXXFLAGS += -D_NETBSD_SOURCE
+MK_CPPFLAGS += -D_NETBSD_SOURCE
 endif
 ifeq ($(UNAME_S),OpenBSD)
-MK_CFLAGS   += -D_BSD_SOURCE
-MK_CXXFLAGS += -D_BSD_SOURCE
+MK_CPPFLAGS += -D_BSD_SOURCE
 endif

 ifdef LLAMA_DEBUG
@@ -180,9 +175,16 @@ endif # LLAMA_DISABLE_LOGS

 # warnings
 MK_CFLAGS   += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith \
 			   -Wmissing-prototypes -Werror=implicit-int -Wno-unused-function
-MK_CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar
+MK_CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wmissing-declarations -Wno-unused-function -Wno-multichar

-ifeq '' '$(findstring clang++,$(CXX))'
+# TODO(cebtenzzre): remove this once PR #2632 gets merged
+TTFS_CXXFLAGS = $(CXXFLAGS) -Wno-missing-declarations
+
+ifneq '' '$(findstring clang,$(shell $(CXX) --version))'
+	# clang++ only
+	MK_CXXFLAGS   += -Wmissing-prototypes
+	TTFS_CXXFLAGS += -Wno-missing-prototypes
+else
 	# g++ only
 	MK_CXXFLAGS += -Wno-format-truncation -Wno-array-bounds
 endif
@@ -233,7 +235,7 @@ ifndef RISCV
 ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
 	# Use all CPU extensions that are available:
 	MK_CFLAGS        += -march=native -mtune=native
-	MK_CXXFLAGS      += -march=native -mtune=native
+	MK_HOST_CXXFLAGS += -march=native -mtune=native

 	# Usage AVX-only
 	#MK_CFLAGS   += -mfma -mf16c -mavx
@@ -373,7 +375,7 @@ ifdef LLAMA_CUDA_CCBIN
 	NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
 endif
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
-	$(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) -Wno-pedantic -c $< -o $@
+	$(NVCC) $(NVCCFLAGS) -Wno-pedantic -c $< -o $@
 endif # LLAMA_CUBLAS

 ifdef LLAMA_CLBLAST
@@ -408,7 +410,6 @@ ifdef LLAMA_HIPBLAS
 	HIPFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
 	HIPFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
 	HIPFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
-	HIPFLAGS += -DCC_TURING=1000000000
 ifdef LLAMA_CUDA_FORCE_DMMV
 	HIPFLAGS += -DGGML_CUDA_FORCE_DMMV
 endif # LLAMA_CUDA_FORCE_DMMV
@@ -442,23 +443,30 @@ k_quants.o: k_quants.c k_quants.h
 endif # LLAMA_NO_K_QUANTS

 # combine build flags with cmdline overrides
 override CFLAGS    := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(CFLAGS)
 override CXXFLAGS  := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS)
-override LDFLAGS   := $(MK_LDFLAGS) $(LDFLAGS)
+override CUDA_CXXFLAGS := $(MK_CUDA_CXXFLAGS) $(CUDA_CXXFLAGS)
+override HOST_CXXFLAGS := $(MK_HOST_CXXFLAGS) $(HOST_CXXFLAGS)
+override LDFLAGS       := $(MK_LDFLAGS) $(LDFLAGS)
+
+# save CXXFLAGS before we add host-only options
+NVCCFLAGS := $(NVCCFLAGS) $(CXXFLAGS) $(CUDA_CXXFLAGS) -Wno-pedantic -Xcompiler "$(HOST_CXXFLAGS)"
+override CXXFLAGS += $(HOST_CXXFLAGS)

 #
 # Print build information
 #

 $(info I llama.cpp build info: )
 $(info I UNAME_S:  $(UNAME_S))
 $(info I UNAME_P:  $(UNAME_P))
 $(info I UNAME_M:  $(UNAME_M))
 $(info I CFLAGS:   $(CFLAGS))
 $(info I CXXFLAGS: $(CXXFLAGS))
+$(info I NVCCFLAGS: $(NVCCFLAGS))
 $(info I LDFLAGS:  $(LDFLAGS))
 $(info I CC:       $(CCV))
 $(info I CXX:      $(CXXV))
 $(info )
# #
@@ -504,22 +512,22 @@ main: examples/main/main.cpp build-info.h ggml.
 	@echo '====  Run ./main -h for help.  ===='
 	@echo

-simple: examples/simple/simple.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+simple: examples/simple/simple.cpp ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o $(OBJS)
+quantize: examples/quantize/quantize.cpp ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o llama.o $(OBJS)
+quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

 server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS)
@@ -536,7 +544,7 @@ gguf: examples/gguf/gguf.cpp ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

 train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o common.o train.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+	$(CXX) $(TTFS_CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

 convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
@@ -562,7 +570,7 @@ metal: examples/metal/metal.cpp ggml.o $(OBJS)
 endif

 build-info.h: $(wildcard .git/index) scripts/build-info.sh
-	@sh scripts/build-info.sh > $@.tmp
+	@sh scripts/build-info.sh $(CC) > $@.tmp
 	@if ! cmp -s $@.tmp $@; then \
 		mv $@.tmp $@; \
 	else \
@@ -575,7 +583,7 @@ build-info.h: $(wildcard .git/index) scripts/build-info.sh

 tests: $(TEST_TARGETS)

-benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o $(OBJS)
+benchmark-matmult: examples/benchmark/benchmark-matmult.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 	./$@
@@ -612,7 +620,7 @@ tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp build-info.h gg
 tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-tokenizer-1: tests/test-tokenizer-1.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

 tests/test-c.o: tests/test-c.c llama.h

README.md

@@ -844,8 +844,17 @@ Place your desired model into the `~/llama.cpp/models/` directory and execute th
 #### Images

 We have two Docker images available for this project:

-1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
-2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file.
+1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`)
+2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`)
+
+Additionally, there are the following images, similar to the above:
+
+- `ghcr.io/ggerganov/llama.cpp:full-cuda`: Same as `full` but compiled with CUDA support. (platforms: `linux/amd64`)
+- `ghcr.io/ggerganov/llama.cpp:light-cuda`: Same as `light` but compiled with CUDA support. (platforms: `linux/amd64`)
+- `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
+- `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
+
+The GPU-enabled images are not currently tested by CI beyond being built. They are built exactly as defined by the Dockerfiles in [.devops/](.devops/) and the GitHub Action in [.github/workflows/docker.yml](.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library), you'll need to build the images locally for now.

 #### Usage

common/common.cpp

@@ -78,7 +78,7 @@ int32_t get_num_physical_cores() {
     return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
 }

-void process_escapes(std::string& input) {
+static void process_escapes(std::string& input) {
     std::size_t input_len = input.length();
     std::size_t output_idx = 0;
@@ -386,6 +386,17 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 #else
             fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
             fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+#endif
+        } else if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
+            params.n_gpu_layers_draft = std::stoi(argv[i]);
+#else
+            fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n");
+            fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
 #endif
         } else if (arg == "--main-gpu" || arg == "-mg") {
             if (++i >= argc) {
@@ -435,8 +446,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 #endif // GGML_USE_CUBLAS
         } else if (arg == "--no-mmap") {
             params.use_mmap = false;
-        } else if (arg == "--mtest") {
-            params.mem_test = true;
         } else if (arg == "--numa") {
             params.numa = true;
         } else if (arg == "--export") {
@@ -676,6 +685,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
     printf("  -ngl N, --n-gpu-layers N\n");
     printf("                        number of layers to store in VRAM\n");
+    printf("  -ngld N, --n-gpu-layers-draft N\n");
+    printf("                        number of layers to store in VRAM for the draft model\n");
     printf("  -ts SPLIT --tensor-split SPLIT\n");
     printf("                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
     printf("  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
@@ -686,7 +697,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("                        Not recommended since this is both slower and uses more VRAM.\n");
 #endif // GGML_USE_CUBLAS
 #endif
-    printf("  --mtest               compute maximum memory usage\n");
     printf("  --export              export the computation graph to 'llama.ggml'\n");
     printf("  --verbose-prompt      print prompt before generation\n");
     fprintf(stderr, "  --simple-io           use basic IO for better compatibility in subprocesses and limited consoles\n");
@@ -809,10 +819,10 @@ std::vector<llama_token> llama_tokenize(
     // upper limit for the number of tokens
     int n_tokens = text.length() + add_bos;
     std::vector<llama_token> result(n_tokens);
-    n_tokens = llama_tokenize(ctx, text.c_str(), result.data(), result.size(), add_bos);
+    n_tokens = llama_tokenize(ctx, text.data(), text.length(), result.data(), result.size(), add_bos);
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check = llama_tokenize(ctx, text.c_str(), result.data(), result.size(), add_bos);
+        int check = llama_tokenize(ctx, text.data(), text.length(), result.data(), result.size(), add_bos);
         GGML_ASSERT(check == -n_tokens);
     } else {
         result.resize(n_tokens);
@@ -1243,7 +1253,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
     fprintf(stream, "model: %s # default: models/7B/ggml-model.bin\n", params.model.c_str());
     fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
-    fprintf(stream, "mtest: %s # default: false\n", params.mem_test ? "true" : "false");
     fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
     fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
     fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);

common/common.h

@@ -3,6 +3,7 @@
 #pragma once

 #include "llama.h"
+#include "build-info.h"

 #define LOG_NO_FILE_LINE_FUNCTION
 #include "log.h"

@@ -20,8 +21,13 @@
 #define DIRECTORY_SEPARATOR '/'
 #endif // _WIN32

 #define die(msg)          do { fputs("error: " msg "\n", stderr);                exit(1); } while (0)
-#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", ##__VA_ARGS__); exit(1); } while (0)
+#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
+
+#define print_build_info() do {                                                                    \
+    fprintf(stderr, "%s: build = %d (%s)\n",      __func__, BUILD_NUMBER, BUILD_COMMIT);           \
+    fprintf(stderr, "%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET);         \
+} while(0)

 //
 // CLI argument parsing
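Dropping the GNU `##__VA_ARGS__` extension makes `die_fmt` standard C++, but the macro now requires at least one argument after the format string; with none, the expansion leaves a trailing comma. A small illustrative sketch (hypothetical call sites, not from this diff):

    // fine: the variadic list is non-empty
    die_fmt("unknown argument: %s", arg.c_str());

    // die_fmt("oops") would expand to fprintf(stderr, "error: " "oops" "\n", );
    // a syntax error without the GNU extension -- use die("oops") for fixed messages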
@@ -38,6 +44,7 @@ struct gpt_params {
     int32_t n_draft           = 16;  // number of tokens to draft during speculative decoding
     int32_t n_chunks          = -1;  // max number of chunks to process (-1 = unlimited)
     int32_t n_gpu_layers      = -1;  // number of layers to store in VRAM (-1 - use default)
+    int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
     int32_t main_gpu          = 0;   // the GPU that is used for scratch and small tensors
     float   tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
     int32_t n_probs           = 0;   // if greater than 0, output the probabilities of top n_probs tokens.
@@ -109,7 +116,6 @@ struct gpt_params {
     bool perplexity        = false; // compute perplexity over the prompt
     bool use_mmap          = true;  // use mmap for faster loads
     bool use_mlock         = false; // use mlock to keep model in memory
-    bool mem_test          = false; // compute maximum memory usage
     bool numa              = false; // attempt optimizations that help on some NUMA systems
     bool export_cgraph     = false; // export the computation graph
     bool verbose_prompt    = false; // print prompt tokens before generation
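Together with the new `--gpu-layers-draft` / `-ngld` flag parsed in common.cpp above, this field lets speculative decoding offload the draft model's layers independently of the target model's. A minimal sketch of setting both programmatically (model paths are hypothetical):

    gpt_params params;
    params.model              = "models/70B/ggml-model.gguf"; // target model (hypothetical path)
    params.model_draft        = "models/7B/ggml-model.gguf";  // draft model (hypothetical path)
    params.n_gpu_layers       = 40; // offload for the target model
    params.n_gpu_layers_draft = 40; // new field: offload for the draft model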

common/console.cpp

@@ -158,7 +158,7 @@ namespace console {
         }
     }

-    char32_t getchar32() {
+    static char32_t getchar32() {
 #if defined(_WIN32)
         HANDLE hConsole = GetStdHandle(STD_INPUT_HANDLE);
         wchar_t high_surrogate = 0;

@@ -212,7 +212,7 @@ namespace console {
 #endif
     }

-    void pop_cursor() {
+    static void pop_cursor() {
 #if defined(_WIN32)
         if (hConsole != NULL) {
             CONSOLE_SCREEN_BUFFER_INFO bufferInfo;

@@ -233,7 +233,7 @@ namespace console {
         putc('\b', out);
     }

-    int estimateWidth(char32_t codepoint) {
+    static int estimateWidth(char32_t codepoint) {
 #if defined(_WIN32)
         (void)codepoint;
         return 1;

@@ -242,7 +242,7 @@ namespace console {
 #endif
     }

-    int put_codepoint(const char* utf8_codepoint, size_t length, int expectedWidth) {
+    static int put_codepoint(const char* utf8_codepoint, size_t length, int expectedWidth) {
 #if defined(_WIN32)
         CONSOLE_SCREEN_BUFFER_INFO bufferInfo;
         if (!GetConsoleScreenBufferInfo(hConsole, &bufferInfo)) {

@@ -303,7 +303,7 @@ namespace console {
 #endif
     }

-    void replace_last(char ch) {
+    static void replace_last(char ch) {
 #if defined(_WIN32)
         pop_cursor();
         put_codepoint(&ch, 1, 1);

@@ -312,7 +312,7 @@ namespace console {
 #endif
     }

-    void append_utf8(char32_t ch, std::string & out) {
+    static void append_utf8(char32_t ch, std::string & out) {
         if (ch <= 0x7F) {
             out.push_back(static_cast<unsigned char>(ch));
         } else if (ch <= 0x7FF) {

@@ -333,7 +333,7 @@ namespace console {
     }

     // Helper function to remove the last UTF-8 character from a string
-    void pop_back_utf8_char(std::string & line) {
+    static void pop_back_utf8_char(std::string & line) {
         if (line.empty()) {
             return;
         }

@@ -349,7 +349,7 @@ namespace console {
         line.erase(pos);
     }

-    bool readline_advanced(std::string & line, bool multiline_input) {
+    static bool readline_advanced(std::string & line, bool multiline_input) {
         if (out != stdout) {
             fflush(stdout);
         }

@@ -452,7 +452,7 @@ namespace console {
         return has_more;
     }

-    bool readline_simple(std::string & line, bool multiline_input) {
+    static bool readline_simple(std::string & line, bool multiline_input) {
 #if defined(_WIN32)
         std::wstring wline;
         if (!std::getline(std::wcin, wline)) {

common/grammar-parser.cpp

@@ -9,7 +9,7 @@
 namespace grammar_parser {
     // NOTE: assumes valid utf8 (but checks for overrun)
     // copied from llama.cpp
-    std::pair<uint32_t, const char *> decode_utf8(const char * src) {
+    static std::pair<uint32_t, const char *> decode_utf8(const char * src) {
         static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
         uint8_t  first_byte = static_cast<uint8_t>(*src);
         uint8_t  highbits = first_byte >> 4;

@@ -24,19 +24,19 @@ namespace grammar_parser {
         return std::make_pair(value, pos);
     }

-    uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) {
+    static uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) {
         uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
         auto result = state.symbol_ids.insert(std::make_pair(std::string(src, len), next_id));
         return result.first->second;
     }

-    uint32_t generate_symbol_id(parse_state & state, const std::string & base_name) {
+    static uint32_t generate_symbol_id(parse_state & state, const std::string & base_name) {
         uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
         state.symbol_ids[base_name + '_' + std::to_string(next_id)] = next_id;
         return next_id;
     }

-    void add_rule(
+    static void add_rule(
             parse_state & state,
             uint32_t      rule_id,
             const std::vector<llama_grammar_element> & rule) {

@@ -46,11 +46,11 @@ namespace grammar_parser {
         state.rules[rule_id] = rule;
     }

-    bool is_word_char(char c) {
+    static bool is_word_char(char c) {
         return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || ('0' <= c && c <= '9');
     }

-    std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
+    static std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
         const char * pos   = src;
         const char * end   = src + size;
         uint32_t     value = 0;

@@ -73,7 +73,7 @@ namespace grammar_parser {
         return std::make_pair(value, pos);
     }

-    const char * parse_space(const char * src, bool newline_ok) {
+    static const char * parse_space(const char * src, bool newline_ok) {
         const char * pos = src;
         while (*pos == ' ' || *pos == '\t' || *pos == '#' ||
                 (newline_ok && (*pos == '\r' || *pos == '\n'))) {

@@ -88,7 +88,7 @@ namespace grammar_parser {
         return pos;
     }

-    const char * parse_name(const char * src) {
+    static const char * parse_name(const char * src) {
         const char * pos = src;
         while (is_word_char(*pos)) {
             pos++;

@@ -99,7 +99,7 @@ namespace grammar_parser {
         return pos;
     }

-    std::pair<uint32_t, const char *> parse_char(const char * src) {
+    static std::pair<uint32_t, const char *> parse_char(const char * src) {
         if (*src == '\\') {
             switch (src[1]) {
                 case 'x': return parse_hex(src + 2, 2);

@@ -129,7 +129,7 @@ namespace grammar_parser {
             uint32_t rule_id,
             bool is_nested);

-    const char * parse_sequence(
+    static const char * parse_sequence(
             parse_state & state,
             const char  * src,
             const std::string & rule_name,

@@ -247,7 +247,7 @@ namespace grammar_parser {
         return pos;
     }

-    const char * parse_rule(parse_state & state, const char * src) {
+    static const char * parse_rule(parse_state & state, const char * src) {
         const char * name_end = parse_name(src);
         const char * pos      = parse_space(name_end, false);
         size_t       name_len = name_end - src;

@@ -285,7 +285,7 @@ namespace grammar_parser {
         }
     }

-    void print_grammar_char(FILE * file, uint32_t c) {
+    static void print_grammar_char(FILE * file, uint32_t c) {
         if (0x20 <= c && c <= 0x7f) {
             fprintf(file, "%c", static_cast<char>(c));
         } else {

@@ -294,7 +294,7 @@ namespace grammar_parser {
         }
     }

-    bool is_char_element(llama_grammar_element elem) {
+    static bool is_char_element(llama_grammar_element elem) {
         switch (elem.type) {
             case LLAMA_GRETYPE_CHAR:     return true;
             case LLAMA_GRETYPE_CHAR_NOT: return true;

@@ -304,7 +304,7 @@ namespace grammar_parser {
         }
     }

-    void print_rule_binary(FILE * file, const std::vector<llama_grammar_element> & rule) {
+    static void print_rule_binary(FILE * file, const std::vector<llama_grammar_element> & rule) {
         for (auto elem : rule) {
             switch (elem.type) {
                 case LLAMA_GRETYPE_END: fprintf(file, "END"); break;

@@ -334,7 +334,7 @@ namespace grammar_parser {
         fprintf(file, "\n");
     }

-    void print_rule(
+    static void print_rule(
             FILE     * file,
             uint32_t   rule_id,
             const std::vector<llama_grammar_element> & rule,

304
convert-baichuan-hf-to-gguf.py Executable file
View file

@ -0,0 +1,304 @@
#!/usr/bin/env python3
# HF baichuan --> gguf conversion
from __future__ import annotations
import argparse
import json
import os
import struct
import sys
from pathlib import Path
from typing import TYPE_CHECKING, Any
import itertools
import gguf
import numpy as np
import torch
from sentencepiece import SentencePieceProcessor # type: ignore[import]
if TYPE_CHECKING:
from typing import TypeAlias
NDArray: TypeAlias = 'np.ndarray[Any, Any]'
# reverse HF permute back to original pth layout
def reverse_hf_permute(weights: NDArray, n_head: int, n_kv_head: int | None = None) -> NDArray:
if n_kv_head is not None and n_head != n_kv_head:
n_head //= n_kv_head
return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
.swapaxes(1, 2)
.reshape(weights.shape))
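# a quick shape sanity check on dummy data (illustrative only, not part of the conversion flow):
#   w = np.zeros((8, 4), dtype=np.float32)
#   assert reverse_hf_permute(w, n_head=2).shape == w.shape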
def reverse_hf_permute_part(weights: NDArray, n_part: int, n_head: int, n_head_kv: int| None = None) -> NDArray:
r = weights.shape[0] // 3
return (reverse_hf_permute(weights[r * n_part : r * n_part + r, ...], n_head, n_head_kv))
def reverse_hf_part(weights: NDArray, n_part: int) -> NDArray:
r = weights.shape[0] // 3
return weights[r * n_part : r * n_part + r, ...]
def count_model_parts(dir_model: str) -> int:
num_parts = 0
for filename in os.listdir(dir_model):
if filename.startswith("pytorch_model-"):
num_parts += 1
if num_parts > 0:
print("gguf: found " + str(num_parts) + " model parts")
return num_parts
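# e.g. a directory holding pytorch_model-00001-of-00002.bin and
# pytorch_model-00002-of-00002.bin yields num_parts == 2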
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Convert a HuggingFace LLaMA model to a GGML compatible file")
parser.add_argument(
"--vocab-only", action="store_true",
help="extract only the vocab",
)
parser.add_argument(
"--outfile", type=Path,
help="path to write to; default: based on input",
)
parser.add_argument(
"model", type=Path,
help="directory containing model file, or model file itself (*.bin)",
)
parser.add_argument(
"ftype", type=int, choices=[0, 1], default=1, nargs='?',
help="output format - use 0 for float32, 1 for float16",
)
return parser.parse_args()
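# typical invocation (paths hypothetical): python3 convert-baichuan-hf-to-gguf.py /path/to/Baichuan-7B 1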
args = parse_args()
dir_model = args.model
ftype = args.ftype
if not dir_model.is_dir():
print(f'Error: {args.model} is not a directory', file = sys.stderr)
sys.exit(1)
# possible tensor data types
# ftype == 0 -> float32
# ftype == 1 -> float16
# map from ftype to string
ftype_str = ["f32", "f16"]
if args.outfile is not None:
fname_out = args.outfile
else:
# output in the same directory as the model by default
fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
print("gguf: loading model "+dir_model.name)
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
hparams = json.load(f)
print("hello print: ",hparams["architectures"][0])
if hparams["architectures"][0] != "BaichuanForCausalLM":
print("Model architecture not supported: " + hparams["architectures"][0])
sys.exit()
# get number of model parts
num_parts = count_model_parts(dir_model)
print(f"num_parts:{num_parts}\n")
ARCH=gguf.MODEL_ARCH.BAICHUAN
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
print("gguf: get model metadata")
block_count = hparams["num_hidden_layers"]
head_count = hparams["num_attention_heads"]
if "num_key_value_heads" in hparams:
head_count_kv = hparams["num_key_value_heads"]
else:
head_count_kv = head_count
if "_name_or_path" in hparams:
hf_repo = hparams["_name_or_path"]
else:
hf_repo = ""
if "max_sequence_length" in hparams:
ctx_length = hparams["max_sequence_length"]
elif "max_position_embeddings" in hparams:
ctx_length = hparams["max_position_embeddings"]
elif "model_max_length" in hparams:
ctx_length = hparams["model_max_length"]
else:
print("gguf: cannot find ctx length parameter.")
sys.exit(1)
gguf_writer.add_name(dir_model.name)
gguf_writer.add_source_hf_repo(hf_repo)
gguf_writer.add_tensor_data_layout("Meta AI original pth")
gguf_writer.add_context_length(ctx_length)
gguf_writer.add_embedding_length(hparams["hidden_size"])
gguf_writer.add_block_count(block_count)
gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
gguf_writer.add_head_count(head_count)
gguf_writer.add_head_count_kv(head_count_kv)
gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]:
if "type" in hparams["rope_scaling"]:
if hparams["rope_scaling"]["type"] == "linear":
gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"])
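# for reference, a matching entry in config.json looks like:
#   "rope_scaling": {"type": "linear", "factor": 2.0}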
# TOKENIZATION
print("gguf: get tokenizer metadata")
tokens: list[bytes] = []
scores: list[float] = []
toktypes: list[int] = []
tokenizer_model_file = dir_model / 'tokenizer.model'
if not tokenizer_model_file.is_file():
print(f'Error: Missing {tokenizer_model_file}', file = sys.stderr)
sys.exit(1)
# vocab type sentencepiece
print("gguf: get sentencepiece tokenizer vocab, scores and token types")
tokenizer = SentencePieceProcessor(str(tokenizer_model_file))
for i in range(tokenizer.vocab_size()):
text: bytes
score: float
piece = tokenizer.id_to_piece(i)
text = piece.encode("utf-8")
score = tokenizer.get_score(i)
toktype = 1 # default to normal token type
if tokenizer.is_unknown(i):
toktype = 2
if tokenizer.is_control(i):
toktype = 3
# toktype = 4 is user-defined = tokens from added_tokens.json
if tokenizer.is_unused(i):
toktype = 5
if tokenizer.is_byte(i):
toktype = 6
tokens.append(text)
scores.append(score)
toktypes.append(toktype)
added_tokens_file = dir_model / 'added_tokens.json'
if added_tokens_file.is_file():
with open(added_tokens_file, "r", encoding="utf-8") as f:
addtokens_json = json.load(f)
print("gguf: get added tokens")
for key in addtokens_json:
tokens.append( key.encode("utf-8") )
scores.append(-1000.0)
toktypes.append(4) # user-defined token type
gguf_writer.add_tokenizer_model("llama")
gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(dir_model)
special_vocab.add_to_gguf(gguf_writer)
# TENSORS
tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
# tensor info
print("gguf: get tensor metadata")
if num_parts == 0:
part_names = iter(("pytorch_model.bin",))
else:
part_names = (
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
)
for part_name in part_names:
if args.vocab_only:
break
print("gguf: loading model part '" + part_name + "'")
model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
tmp = model_part # alias: the unpacked q/k/v tensors are written back into the same dict
for i in range(block_count):
if f"model.layers.{i}.self_attn.W_pack.weight" in model_part:
print(f"Unpacking and permuting layer {i}")
tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = reverse_hf_permute_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"], 0, head_count, head_count)
tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = reverse_hf_permute_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"], 1, head_count, head_count_kv)
tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = reverse_hf_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"], 2)
del tmp[f"model.layers.{i}.self_attn.W_pack.weight"]
for name in model_part.keys():
data = model_part[name]
# we don't need these
if name.endswith(".rotary_emb.inv_freq"):
continue
old_dtype = data.dtype
# convert any unsupported data types to float32
if data.dtype != torch.float16 and data.dtype != torch.float32:
data = data.to(torch.float32)
data = data.squeeze().numpy()
# map tensor names
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
if new_name is None:
print("Cannot map tensor '" + name + "'")
sys.exit(1)
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
# TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16)
print(name + " -> " + new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
gguf_writer.add_tensor(new_name, data)
print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
if not args.vocab_only:
print("gguf: write tensors")
gguf_writer.write_tensors_to_file()
gguf_writer.close()
print(f"gguf: model successfully exported to '{fname_out}'")
print("")

View file

@ -55,10 +55,22 @@ def count_model_parts(dir_model: Path) -> int:
def parse_args() -> argparse.Namespace: def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Convert a Falcon model to a GGML compatible file") parser = argparse.ArgumentParser(description="Convert a Falcon model to a GGML compatible file")
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") parser.add_argument(
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") "--vocab-only", action="store_true",
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)") help="extract only the vocab",
parser.add_argument("ftype", type=int, help="output format - use 0 for float32, 1 for float16", choices=[0, 1], default = 1) )
parser.add_argument(
"--outfile", type=Path,
help="path to write to; default: based on input",
)
parser.add_argument(
"model", type=Path,
help="directory containing model file, or model file itself (*.bin)",
)
parser.add_argument(
"ftype", type=int, choices=[0, 1], default=1, nargs='?',
help="output format - use 0 for float32, 1 for float16",
)
return parser.parse_args() return parser.parse_args()
args = parse_args() args = parse_args()
@ -137,7 +149,9 @@ with open(tokenizer_json_file, "r", encoding="utf-8") as f:
print("gguf: get gpt2 tokenizer vocab") print("gguf: get gpt2 tokenizer vocab")
vocab_size = len(tokenizer_json["model"]["vocab"]) # The number of tokens in tokenizer.json can differ from the expected vocab size.
# This causes downstream issues with mismatched tensor sizes when running the inference
vocab_size = hparams["vocab_size"] if "vocab_size" in hparams else len(tokenizer_json["model"]["vocab"])
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
tokenizer = AutoTokenizer.from_pretrained(dir_model) tokenizer = AutoTokenizer.from_pretrained(dir_model)

View file

@ -56,10 +56,22 @@ def count_model_parts(dir_model: Path) -> int:
def parse_args() -> argparse.Namespace: def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Convert a GPT-NeoX model to a GGML compatible file") parser = argparse.ArgumentParser(description="Convert a GPT-NeoX model to a GGML compatible file")
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") parser.add_argument(
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") "--vocab-only", action="store_true",
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)") help="extract only the vocab",
parser.add_argument("ftype", type=int, choices=[0, 1], help="output format - use 0 for float32, 1 for float16", default = 1) )
parser.add_argument(
"--outfile", type=Path,
help="path to write to; default: based on input",
)
parser.add_argument(
"model", type=Path,
help="directory containing model file, or model file itself (*.bin)",
)
parser.add_argument(
"ftype", type=int, choices=[0, 1], default=1, nargs='?',
help="output format - use 0 for float32, 1 for float16",
)
return parser.parse_args() return parser.parse_args()
args = parse_args() args = parse_args()

248
convert-starcoder-hf-to-gguf.py Executable file
View file

@ -0,0 +1,248 @@
#!/usr/bin/env python3
# HF starcoder --> gguf conversion
from __future__ import annotations
import argparse
import json
import os
import struct
import sys
from pathlib import Path
from typing import Any
import numpy as np
import torch
from transformers import AutoTokenizer # type: ignore[import]
if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf
def bytes_to_unicode():
# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
"""
Returns a list of utf-8 bytes and a corresponding list of unicode strings.
The reversible bpe codes work on unicode strings.
This means you need a large number of unicode characters in your vocab if you want to avoid UNKs.
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
This is a significant percentage of your normal, say, 32K bpe vocab.
To avoid that, we want lookup tables between utf-8 bytes and unicode strings,
and to avoid mapping to whitespace/control characters the bpe code barfs on.
"""
bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
cs = bs[:]
n = 0
for b in range(2**8):
if b not in bs:
bs.append(b)
cs.append(2**8+n)
n += 1
return dict(zip(bs, (chr(n) for n in cs)))
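# e.g. printable bytes map to themselves, bytes_to_unicode()[ord('a')] == 'a',
# while bytes outside the printable ranges are shifted above 0xFF:
# the space byte 0x20 maps to 'Ġ' and the null byte 0x00 maps to 'Ā'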
def count_model_parts(dir_model: Path) -> int:
num_parts = 0
for filename in os.listdir(dir_model):
if filename.startswith("pytorch_model-"):
num_parts += 1
if num_parts > 0:
print("gguf: found " + str(num_parts) + " model parts")
return num_parts
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Convert a StarCoder model to a GGML compatible file")
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
parser.add_argument("ftype", type=int, help="output format - use 0 for float32, 1 for float16", choices=[0, 1], default = 1)
return parser.parse_args()
args = parse_args()
dir_model = args.model
ftype = args.ftype
if not dir_model.is_dir():
print(f'Error: {args.model} is not a directory', file = sys.stderr)
sys.exit(1)
# possible tensor data types
# ftype == 0 -> float32
# ftype == 1 -> float16
# map from ftype to string
ftype_str = ["f32", "f16"]
if args.outfile is not None:
fname_out = args.outfile
else:
# output in the same directory as the model by default
fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
print("gguf: loading model "+dir_model.name)
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
hparams = json.load(f)
if hparams["architectures"][0] != "GPTBigCodeForCausalLM":
print("Model architecture not supported: " + hparams["architectures"][0])
sys.exit(1)
# get number of model parts
num_parts = count_model_parts(dir_model)
ARCH=gguf.MODEL_ARCH.STARCODER
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
print("gguf: get model metadata")
block_count = hparams["n_layer"]
gguf_writer.add_name("StarCoder")
gguf_writer.add_context_length(hparams["n_positions"])
gguf_writer.add_embedding_length(hparams["n_embd"])
gguf_writer.add_feed_forward_length(4 * hparams["n_embd"])
gguf_writer.add_block_count(block_count)
gguf_writer.add_head_count(hparams["n_head"])
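# StarCoder (GPTBigCode) uses multi-query attention, so all query heads share a single KV head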
gguf_writer.add_head_count_kv(1)
gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
gguf_writer.add_file_type(ftype)
# TOKENIZATION
print("gguf: get tokenizer metadata")
tokens: list[bytearray] = []
scores: list[float] = []
toktypes: list[int] = []
tokenizer_json_file = dir_model / 'tokenizer.json'
if not tokenizer_json_file.is_file():
print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
sys.exit(1)
# gpt2 tokenizer
gguf_writer.add_tokenizer_model("gpt2")
with open(tokenizer_json_file, "r", encoding="utf-8") as f:
tokenizer_json = json.load(f)
print("gguf: get gpt2 tokenizer vocab")
# The number of tokens in tokenizer.json can differ from the expected vocab size.
# This causes downstream issues with mismatched tensor sizes when running the inference
vocab_size = hparams["vocab_size"] if "vocab_size" in hparams else len(tokenizer_json["model"]["vocab"])
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
tokenizer = AutoTokenizer.from_pretrained(dir_model)
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
byte_encoder = bytes_to_unicode()
byte_decoder = {v: k for k, v in byte_encoder.items()}
for i in range(vocab_size):
if i in reverse_vocab:
try:
text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
except KeyError:
text = bytearray()
for c in reverse_vocab[i]:
if ord(c) < 256: # single byte character
text.append(byte_decoder[ord(c)])
else: # multibyte special token character
text.extend(c.encode('utf-8'))
else:
print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
pad_token = f"[PAD{i}]".encode("utf8")
text = bytearray(pad_token)
tokens.append(text)
scores.append(0.0) # dummy
toktypes.append(gguf.TokenType.NORMAL) # dummy
gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
special_vocab.add_to_gguf(gguf_writer)
# TENSORS
tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
# params for qkv transform
n_head = hparams["n_head"]
n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1
head_dim = hparams["n_embd"] // n_head
# tensor info
print("gguf: get tensor metadata")
if num_parts == 0:
part_names = iter(("pytorch_model.bin",))
else:
part_names = (
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
)
for part_name in part_names:
if args.vocab_only:
break
print("gguf: loading model part '" + part_name + "'")
model_part = torch.load(dir_model / part_name, map_location="cpu")
for name in model_part.keys():
data = model_part[name]
old_dtype = data.dtype
# convert any unsupported data types to float32
if data.dtype != torch.float16 and data.dtype != torch.float32:
data = data.to(torch.float32)
data = data.squeeze().numpy()
# map tensor names
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
if new_name is None:
print("Cannot map tensor '" + name + "'")
sys.exit(1)
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
# TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16)
print(name, "=>", new_name + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype))
gguf_writer.add_tensor(new_name, data)
print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
if not args.vocab_only:
print("gguf: write tensors")
gguf_writer.write_tensors_to_file()
gguf_writer.close()
print(f"gguf: model successfully exported to '{fname_out}'")
print("")

View file

@ -145,7 +145,6 @@ GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = {
class Params: class Params:
n_vocab: int n_vocab: int
n_embd: int n_embd: int
n_mult: int
n_layer: int n_layer: int
n_ctx: int n_ctx: int
n_ff: int n_ff: int
@ -161,15 +160,6 @@ class Params:
# path to the directory containing the model files # path to the directory containing the model files
path_model: Path | None = None path_model: Path | None = None
@staticmethod
def find_n_mult(n_ff: int, n_embd: int) -> int:
# hardcoded magic range
for n_mult in range(8192, 1, -1):
calc_ff = (((8*n_embd) // 3 + n_mult - 1) // n_mult)*n_mult
if calc_ff == n_ff:
return n_mult
raise Exception(f"failed to find n_mult for (n_ff={n_ff}, n_embd={n_embd}).")
@staticmethod @staticmethod
def guessed(model: LazyModel) -> Params: def guessed(model: LazyModel) -> Params:
# try transformer naming first # try transformer naming first
@ -197,7 +187,6 @@ class Params:
return Params( return Params(
n_vocab = n_vocab, n_vocab = n_vocab,
n_embd = n_embd, n_embd = n_embd,
n_mult = n_mult,
n_layer = n_layer, n_layer = n_layer,
n_ctx = -1, n_ctx = -1,
n_ff = n_ff, n_ff = n_ff,
@ -225,8 +214,6 @@ class Params:
else: else:
f_rope_scale = None f_rope_scale = None
n_mult = Params.find_n_mult(n_ff, n_embd)
if "max_sequence_length" in config: if "max_sequence_length" in config:
n_ctx = config["max_sequence_length"] n_ctx = config["max_sequence_length"]
elif "max_position_embeddings" in config: elif "max_position_embeddings" in config:
@ -238,7 +225,6 @@ class Params:
return Params( return Params(
n_vocab = n_vocab, n_vocab = n_vocab,
n_embd = n_embd, n_embd = n_embd,
n_mult = n_mult,
n_layer = n_layer, n_layer = n_layer,
n_ctx = n_ctx, n_ctx = n_ctx,
n_ff = n_ff, n_ff = n_ff,
@ -250,7 +236,7 @@ class Params:
) )
# LLaMA v2 70B params.json # LLaMA v2 70B params.json
# {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1 # {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1}
@staticmethod @staticmethod
def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params: def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
config = json.load(open(config_path)) config = json.load(open(config_path))
@ -258,7 +244,6 @@ class Params:
n_vocab = config["vocab_size"] if "vocab_size" in config else -1 n_vocab = config["vocab_size"] if "vocab_size" in config else -1
n_embd = config["dim"] n_embd = config["dim"]
n_layer = config["n_layers"] n_layer = config["n_layers"]
n_mult = config["multiple_of"]
n_ff = -1 n_ff = -1
n_head = config["n_heads"] n_head = config["n_heads"]
n_head_kv = config["n_kv_heads"] if "n_kv_heads" in config else n_head n_head_kv = config["n_kv_heads"] if "n_kv_heads" in config else n_head
@ -285,7 +270,6 @@ class Params:
return Params( return Params(
n_vocab = n_vocab, n_vocab = n_vocab,
n_embd = n_embd, n_embd = n_embd,
n_mult = n_mult,
n_layer = n_layer, n_layer = n_layer,
n_ctx = n_ctx, n_ctx = n_ctx,
n_ff = n_ff, n_ff = n_ff,

View file

@ -10,12 +10,12 @@
#endif #endif
#ifdef LLAMA_DEFAULT_RMS_EPS #ifdef LLAMA_DEFAULT_RMS_EPS
static const float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS; constexpr float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
#else #else
static const float rms_norm_eps = 5e-6f; constexpr float rms_norm_eps = 5e-6f;
#endif #endif
void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) { static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
if (plan.work_size > 0) { if (plan.work_size > 0) {
@ -26,13 +26,9 @@ void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph,
ggml_graph_compute(graph, &plan); ggml_graph_compute(graph, &plan);
} }
struct ggml_tensor * randomize_tensor( static struct ggml_tensor * randomize_tensor(
struct ggml_tensor * tensor, struct ggml_tensor * tensor, int ndims, const int64_t ne[], float fmin, float fmax
int ndims, ) {
const int64_t ne[],
float fmin,
float fmax) {
switch (ndims) { switch (ndims) {
case 1: case 1:
for (int i0 = 0; i0 < ne[0]; i0++) { for (int i0 = 0; i0 < ne[0]; i0++) {
@ -87,7 +83,7 @@ struct llama_hparams {
} }
}; };
uint32_t get_n_ff(const struct llama_hparams* hparams) { static uint32_t get_n_ff(const struct llama_hparams* hparams) {
const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult; const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult;
return n_ff; return n_ff;
} }
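// e.g. with n_embd = 4096 and n_mult = 256 (the LLaMA-7B values): 2*(4*4096)/3 = 10922,
// rounded up to the next multiple of 256 gives n_ff = 11008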
@ -188,7 +184,7 @@ struct llama_model_lora {
std::vector<llama_layer_lora> layers; std::vector<llama_layer_lora> layers;
}; };
void init_model(struct llama_model * model) { static void init_model(struct llama_model * model) {
const auto & hparams = model->hparams; const auto & hparams = model->hparams;
const uint32_t n_embd = hparams.n_embd; const uint32_t n_embd = hparams.n_embd;
@ -225,7 +221,7 @@ void init_model(struct llama_model * model) {
} }
void init_model_lora(struct llama_model_lora * model) { static void init_model_lora(struct llama_model_lora * model) {
const auto & hparams = model->hparams; const auto & hparams = model->hparams;
const uint32_t n_embd = hparams.n_embd; const uint32_t n_embd = hparams.n_embd;
@ -268,7 +264,7 @@ void init_model_lora(struct llama_model_lora * model) {
} }
} }
void set_param_model(struct llama_model * model) { static void set_param_model(struct llama_model * model) {
const auto& hparams = model->hparams; const auto& hparams = model->hparams;
const uint32_t n_layer = hparams.n_layer; const uint32_t n_layer = hparams.n_layer;
@ -294,7 +290,7 @@ void set_param_model(struct llama_model * model) {
} }
} }
void set_param_model_lora(struct llama_model_lora * model) { static void set_param_model_lora(struct llama_model_lora * model) {
const auto& hparams = model->hparams; const auto& hparams = model->hparams;
const uint32_t n_layer = hparams.n_layer; const uint32_t n_layer = hparams.n_layer;
@ -325,7 +321,7 @@ void set_param_model_lora(struct llama_model_lora * model) {
} }
} }
void randomize_model(struct llama_model * model, int seed, float mean, float std, float min, float max) { static void randomize_model(struct llama_model * model, int seed, float mean, float std, float min, float max) {
const auto & hparams = model->hparams; const auto & hparams = model->hparams;
const uint32_t n_layer = hparams.n_layer; const uint32_t n_layer = hparams.n_layer;
@ -356,7 +352,9 @@ void randomize_model(struct llama_model * model, int seed, float mean, float std
} }
void randomize_model_lora(struct llama_model_lora * model, int seed, float mean, float std, float min, float max) { static void randomize_model_lora(
struct llama_model_lora * model, int seed, float mean, float std, float min, float max
) {
const auto & hparams = model->hparams; const auto & hparams = model->hparams;
const uint32_t n_layer = hparams.n_layer; const uint32_t n_layer = hparams.n_layer;
@ -391,7 +389,7 @@ void randomize_model_lora(struct llama_model_lora * model, int seed, float mean,
free_random_normal_distribution(rnd); free_random_normal_distribution(rnd);
} }
bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) { static bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) {
const auto & hparams = model->hparams; const auto & hparams = model->hparams;
const uint32_t n_ctx = hparams.n_ctx; const uint32_t n_ctx = hparams.n_ctx;
@ -427,7 +425,7 @@ bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int
return true; return true;
} }
bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora * model, int n_batch) { static bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora * model, int n_batch) {
const auto & hparams = model->hparams; const auto & hparams = model->hparams;
const uint32_t n_ctx = hparams.n_ctx; const uint32_t n_ctx = hparams.n_ctx;
@ -463,15 +461,15 @@ bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora *
return true; return true;
} }
struct ggml_tensor * forward( static struct ggml_tensor * forward(
struct llama_model * model, struct llama_model * model,
struct llama_kv_cache * cache, struct llama_kv_cache * cache,
struct ggml_context * ctx0, struct ggml_context * ctx0,
struct ggml_cgraph * gf, struct ggml_cgraph * gf,
struct ggml_tensor * tokens_input, struct ggml_tensor * tokens_input,
const int n_tokens, const int n_tokens,
const int n_past) { const int n_past
) {
const int N = n_tokens; const int N = n_tokens;
struct llama_kv_cache& kv_self = *cache; struct llama_kv_cache& kv_self = *cache;
@ -688,16 +686,16 @@ struct ggml_tensor * forward(
return inpL; return inpL;
} }
struct ggml_tensor * forward_batch( static struct ggml_tensor * forward_batch(
struct llama_model * model, struct llama_model * model,
struct llama_kv_cache * cache, struct llama_kv_cache * cache,
struct ggml_context * ctx0, struct ggml_context * ctx0,
struct ggml_cgraph * gf, struct ggml_cgraph * gf,
struct ggml_tensor * tokens_input, struct ggml_tensor * tokens_input,
const int n_tokens, const int n_tokens,
const int n_past, const int n_past,
const int n_batch) { const int n_batch
) {
const int N = n_tokens; const int N = n_tokens;
struct llama_kv_cache& kv_self = *cache; struct llama_kv_cache& kv_self = *cache;
@ -979,16 +977,15 @@ struct ggml_tensor * forward_batch(
return inpL; return inpL;
} }
static struct ggml_tensor * forward_lora(
struct ggml_tensor * forward_lora( struct llama_model_lora * model,
struct llama_model_lora * model, struct llama_kv_cache * cache,
struct llama_kv_cache * cache, struct ggml_context * ctx0,
struct ggml_context * ctx0, struct ggml_cgraph * gf,
struct ggml_cgraph * gf, struct ggml_tensor * tokens_input,
struct ggml_tensor * tokens_input, const int n_tokens,
const int n_tokens, const int n_past
const int n_past) { ) {
const int N = n_tokens; const int N = n_tokens;
struct llama_kv_cache& kv_self = *cache; struct llama_kv_cache& kv_self = *cache;
@ -1234,7 +1231,7 @@ struct ggml_tensor * forward_lora(
return inpL; return inpL;
} }
void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) { static void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) {
assert(logits->n_dims == 2); assert(logits->n_dims == 2);
assert(probs->n_dims == 2); assert(probs->n_dims == 2);
assert(best_samples->n_dims == 1); assert(best_samples->n_dims == 1);
@ -1265,7 +1262,10 @@ void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, str
} }
} }
void sample_softmax_batch(struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) { static void sample_softmax_batch(
struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs,
struct ggml_tensor * best_samples
) {
GGML_ASSERT(best_samples->n_dims == 2); GGML_ASSERT(best_samples->n_dims == 2);
GGML_ASSERT(logits->n_dims == 3); GGML_ASSERT(logits->n_dims == 3);
GGML_ASSERT(probs->n_dims == 3); GGML_ASSERT(probs->n_dims == 3);
@ -1299,7 +1299,7 @@ void sample_softmax_batch(struct ggml_context * ctx, struct ggml_tensor * logits
} }
} }
void print_row(struct ggml_tensor * probs, int i) { static void print_row(struct ggml_tensor * probs, int i) {
for (int k = 0; k < probs->ne[0]; ++k) { for (int k = 0; k < probs->ne[0]; ++k) {
float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k); float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k);
printf(" %.2f", p); printf(" %.2f", p);
@ -1307,7 +1307,7 @@ void print_row(struct ggml_tensor * probs, int i) {
printf("\n"); printf("\n");
} }
void print_matrix(struct ggml_tensor * probs) { static void print_matrix(struct ggml_tensor * probs) {
assert(probs->n_dims == 2); assert(probs->n_dims == 2);
for (int i = 0; i < probs->ne[1]; ++i) { for (int i = 0; i < probs->ne[1]; ++i) {
for (int k = 0; k < probs->ne[0]; ++k) { for (int k = 0; k < probs->ne[0]; ++k) {
@ -1318,7 +1318,7 @@ void print_matrix(struct ggml_tensor * probs) {
} }
} }
void print_token(int token, int n_vocab) { static void print_token(int token, int n_vocab) {
for (int k = 0; k < token; ++k) { for (int k = 0; k < token; ++k) {
printf(" "); printf(" ");
} }
@ -1329,14 +1329,14 @@ void print_token(int token, int n_vocab) {
printf("\n"); printf("\n");
} }
void print_tokens(struct ggml_tensor * tokens, int n_vocab) { static void print_tokens(struct ggml_tensor * tokens, int n_vocab) {
for (int i=0; i<tokens->ne[0]; ++i) { for (int i=0; i<tokens->ne[0]; ++i) {
int token = ggml_get_i32_1d(tokens, i); int token = ggml_get_i32_1d(tokens, i);
print_token(token, n_vocab); print_token(token, n_vocab);
} }
} }
void get_example_targets(int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) { static void get_example_targets(int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) {
int n_tokens = tokens_input->ne[0]; int n_tokens = tokens_input->ne[0];
int n_vocab = targets->ne[0]; int n_vocab = targets->ne[0];
float randomness = 0.0f; float randomness = 0.0f;
@ -1357,7 +1357,9 @@ void get_example_targets(int example_id, struct ggml_tensor * tokens_input, stru
} }
} }
void get_example_targets_batch(struct ggml_context * ctx, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) { static void get_example_targets_batch(
struct ggml_context * ctx, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets
) {
GGML_ASSERT(tokens_input->n_dims == 2); GGML_ASSERT(tokens_input->n_dims == 2);
GGML_ASSERT( targets->n_dims == 3); GGML_ASSERT( targets->n_dims == 3);
int n_tokens = tokens_input->ne[0]; int n_tokens = tokens_input->ne[0];
@ -1380,7 +1382,7 @@ void get_example_targets_batch(struct ggml_context * ctx, int example_id, struct
} }
} }
void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * targets, int n_shift) { static void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * targets, int n_shift) {
int n_tokens = tokens_input->ne[0]; int n_tokens = tokens_input->ne[0];
int n_vocab = targets->ne[0]; int n_vocab = targets->ne[0];
for (int i=0; i<n_tokens-n_shift; ++i) { for (int i=0; i<n_tokens-n_shift; ++i) {
@ -1391,12 +1393,16 @@ void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * tar
} }
} }
struct ggml_tensor * square_error_loss(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) { static struct ggml_tensor * square_error_loss(
struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b
) {
// todo: instead of a-b: a[1:]-b[:-1] // todo: instead of a-b: a[1:]-b[:-1]
return ggml_sum(ctx, ggml_sqr(ctx, ggml_sub(ctx, a, b))); return ggml_sum(ctx, ggml_sqr(ctx, ggml_sub(ctx, a, b)));
} }
struct ggml_tensor * cross_entropy_loss(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) { static struct ggml_tensor * cross_entropy_loss(
struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b
) {
const float eps = 1e-3f; const float eps = 1e-3f;
return return
ggml_sum(ctx, ggml_sum(ctx,

View file

@ -3,6 +3,3 @@ add_executable(${TARGET} beam-search.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()

View file

@ -1,6 +1,5 @@
#include "common.h" #include "common.h"
#include "llama.h" #include "llama.h"
#include "build-info.h"
#include <cassert> #include <cassert>
#include <cinttypes> #include <cinttypes>
@ -30,7 +29,8 @@ struct ostream_beam_view {
llama_context * ctx; llama_context * ctx;
llama_beam_view beam_view; llama_beam_view beam_view;
}; };
std::ostream& operator<<(std::ostream& os, const ostream_beam_view & obv) {
static std::ostream & operator<<(std::ostream & os, const ostream_beam_view & obv) {
os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens("; os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens(";
for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) { for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) {
os << llama_token_to_piece(obv.ctx, obv.beam_view.tokens[i]); os << llama_token_to_piece(obv.ctx, obv.beam_view.tokens[i]);
@ -46,7 +46,7 @@ struct beam_search_callback_data {
// In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same. // In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same.
// For example, eob can be flagged due to maximum token length, stop words, etc. // For example, eob can be flagged due to maximum token length, stop words, etc.
bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, const size_t n_tokens) { static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) {
return n_tokens && tokens[n_tokens-1] == llama_token_eos(callback_data.ctx); return n_tokens && tokens[n_tokens-1] == llama_token_eos(callback_data.ctx);
} }
@ -56,7 +56,7 @@ bool is_at_eob(const beam_search_callback_data & callback_data, const llama_toke
// * When all beams converge to a common prefix, they are made available in beams_state.beams[0]. // * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
// This is also called when the stop condition is met. // This is also called when the stop condition is met.
// Collect tokens into std::vector<llama_token> response which is pointed to by callback_data. // Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) { static void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) {
auto& callback_data = *static_cast<beam_search_callback_data*>(callback_data_ptr); auto& callback_data = *static_cast<beam_search_callback_data*>(callback_data_ptr);
// Mark beams as EOS as needed. // Mark beams as EOS as needed.
for (size_t i = 0 ; i < beams_state.n_beams ; ++i) { for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {

View file

@ -1,7 +1,8 @@
set(TARGET benchmark) set(TARGET benchmark)
add_executable(${TARGET} benchmark-matmult.cpp) add_executable(${TARGET} benchmark-matmult.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
target_include_directories(${TARGET} PRIVATE ../../common)
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO) if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO) add_dependencies(${TARGET} BUILD_INFO)

View file

@ -1,5 +1,5 @@
#include "common.h"
#include "ggml.h" #include "ggml.h"
#include "build-info.h"
#include <locale.h> #include <locale.h>
#include <assert.h> #include <assert.h>
@ -99,7 +99,7 @@ int main(int argc, char ** argv) {
exit(1); exit(1);
} }
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); print_build_info();
printf("Starting Test\n"); printf("Starting Test\n");
// create the ggml context // create the ggml context

View file

@ -115,7 +115,7 @@ struct TransformerWeights {
} }
}; };
void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) { static void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) {
// we calloc instead of malloc to keep valgrind happy // we calloc instead of malloc to keep valgrind happy
w->token_embedding_table = new float[p->vocab_size * p->dim](); w->token_embedding_table = new float[p->vocab_size * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim); printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
@ -158,7 +158,7 @@ void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) {
} }
} }
int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shared_weights) { static int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shared_weights) {
if (fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1; if (fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
if (fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1; if (fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
if (fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1; if (fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
@ -189,7 +189,7 @@ int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shar
return 0; return 0;
} }
void print_sample_weights(TransformerWeights *w){ static void print_sample_weights(TransformerWeights *w){
printf("----- Quick print of first of the weight vales of all the variables\n"); printf("----- Quick print of first of the weight vales of all the variables\n");
printf("%f\n", w->token_embedding_table[0]); printf("%f\n", w->token_embedding_table[0]);
printf("%f\n", w->rms_att_weight[0]); printf("%f\n", w->rms_att_weight[0]);
@ -324,7 +324,7 @@ struct train_params {
int mem_compute1_gb; int mem_compute1_gb;
}; };
void print_params(struct my_llama_hparams * params) { static void print_params(struct my_llama_hparams * params) {
printf("%s: n_vocab: %d\n", __func__, params->n_vocab); printf("%s: n_vocab: %d\n", __func__, params->n_vocab);
printf("%s: n_ctx: %d\n", __func__, params->n_ctx); printf("%s: n_ctx: %d\n", __func__, params->n_ctx);
printf("%s: n_embd: %d\n", __func__, params->n_embd); printf("%s: n_embd: %d\n", __func__, params->n_embd);
@ -335,7 +335,7 @@ void print_params(struct my_llama_hparams * params) {
printf("%s: n_rot: %d\n", __func__, params->n_rot); printf("%s: n_rot: %d\n", __func__, params->n_rot);
} }
void init_model(struct my_llama_model * model) { static void init_model(struct my_llama_model * model) {
const auto & hparams = model->hparams; const auto & hparams = model->hparams;
const uint32_t n_embd = hparams.n_embd; const uint32_t n_embd = hparams.n_embd;
@ -408,17 +408,17 @@ void init_model(struct my_llama_model * model) {
} }
} }
float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) { static float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
return *ptr; return *ptr;
} }
int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) { static int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
return *ptr; return *ptr;
} }
void print_row(struct ggml_tensor * probs, int i) { static void print_row(struct ggml_tensor * probs, int i) {
for (int k = 0; k < probs->ne[0]; ++k) { for (int k = 0; k < probs->ne[0]; ++k) {
float p = get_f32_2d(probs, k, i); float p = get_f32_2d(probs, k, i);
printf(" %f", p); printf(" %f", p);
@ -426,7 +426,7 @@ void print_row(struct ggml_tensor * probs, int i) {
printf("\n"); printf("\n");
} }
void print_matrix(struct ggml_tensor * probs) { static void print_matrix(struct ggml_tensor * probs) {
assert(probs->n_dims == 2); assert(probs->n_dims == 2);
for (int i = 0; i < probs->ne[1]; ++i) { for (int i = 0; i < probs->ne[1]; ++i) {
for (int k = 0; k < probs->ne[0]; ++k) { for (int k = 0; k < probs->ne[0]; ++k) {
@ -531,7 +531,7 @@ struct llama_file {
} }
}; };
bool is_ggml_file(const char *filename) { static bool is_ggml_file(const char * filename) {
llama_file file(filename, "rb"); llama_file file(filename, "rb");
if (file.size < 4) { if (file.size < 4) {
return false; return false;
@ -540,7 +540,7 @@ bool is_ggml_file(const char *filename) {
return magic == GGUF_MAGIC; return magic == GGUF_MAGIC;
} }
static std::string llama_escape_whitespaces(const std::string& text) { static std::string llama_escape_whitespaces(const std::string & text) {
std::ostringstream out; std::ostringstream out;
for (char c : text) { for (char c : text) {
if (c == ' ') out << "\xe2\x96\x81"; if (c == ' ') out << "\xe2\x96\x81";
@ -549,7 +549,7 @@ static std::string llama_escape_whitespaces(const std::string& text) {
return out.str(); return out.str();
} }
void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) { static void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
if (is_ggml_file(filename)) { if (is_ggml_file(filename)) {
struct ggml_context * ctx_data = NULL; struct ggml_context * ctx_data = NULL;
@ -637,7 +637,7 @@ void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab)
} }
} }
void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) { static void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) {
int ct; int ct;
switch (gg_weights->n_dims){ switch (gg_weights->n_dims){
case 1: case 1:
@ -673,7 +673,9 @@ void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * kar
} }
} }
void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename) { static void save_as_llama_model(
struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename
) {
// convert AK weights into GG weights one by one. // convert AK weights into GG weights one by one.
// w->token_embedding_table -> model->tok_embeddings // w->token_embedding_table -> model->tok_embeddings
// float* -> struct ggml_tensor // float* -> struct ggml_tensor
@ -785,7 +787,7 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
gguf_free(ctx); gguf_free(ctx);
} }
struct train_params get_default_train_params() { static struct train_params get_default_train_params() {
struct train_params params; struct train_params params;
params.fn_vocab_model = "models/7B/ggml-model-f16.gguf"; params.fn_vocab_model = "models/7B/ggml-model-f16.gguf";
params.fn_llama2c_output_model = "ak_llama_model.bin"; params.fn_llama2c_output_model = "ak_llama_model.bin";
@ -835,7 +837,7 @@ struct train_params get_default_train_params() {
return params; return params;
} }
void print_usage(int /*argc*/, char ** argv, const struct train_params * params) { static void print_usage(int /*argc*/, char ** argv, const struct train_params * params) {
fprintf(stderr, "usage: %s [options]\n", argv[0]); fprintf(stderr, "usage: %s [options]\n", argv[0]);
fprintf(stderr, "\n"); fprintf(stderr, "\n");
fprintf(stderr, "options:\n"); fprintf(stderr, "options:\n");
@ -846,7 +848,7 @@ void print_usage(int /*argc*/, char ** argv, const struct train_params * params)
fprintf(stderr, "\n"); fprintf(stderr, "\n");
} }
bool params_parse(int argc, char ** argv, struct train_params * params) { static bool params_parse(int argc, char ** argv, struct train_params * params) {
bool invalid_param = false; bool invalid_param = false;
bool reqd_param_found = false; bool reqd_param_found = false;
std::string arg; std::string arg;
@ -901,7 +903,7 @@ bool params_parse(int argc, char ** argv, struct train_params * params) {
return true; return true;
} }
std::string basename(const std::string &path) { static std::string basename(const std::string &path) {
size_t pos = path.find_last_of("/\\"); size_t pos = path.find_last_of("/\\");
if (pos == std::string::npos) { if (pos == std::string::npos) {
return path; return path;

View file

@ -1,3 +1,4 @@
#include "common.h"
#include "embd-input.h" #include "embd-input.h"
#include <cassert> #include <cassert>
@ -22,7 +23,7 @@ struct MyModel* create_mymodel(int argc, char ** argv) {
return nullptr; return nullptr;
} }
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); print_build_info();
if (params.seed == LLAMA_DEFAULT_SEED) { if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = uint32_t(time(NULL)); params.seed = uint32_t(time(NULL));

View file

@ -3,7 +3,6 @@
#include "common.h" #include "common.h"
#include "llama.h" #include "llama.h"
#include "build-info.h"
extern "C" { extern "C" {

View file

@ -1,6 +1,5 @@
#include "common.h" #include "common.h"
#include "llama.h" #include "llama.h"
#include "build-info.h"
#include <ctime> #include <ctime>
@ -17,7 +16,7 @@ int main(int argc, char ** argv) {
params.embedding = true; params.embedding = true;
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); print_build_info();
if (params.seed == LLAMA_DEFAULT_SEED) { if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL); params.seed = time(NULL);

View file

@ -13,14 +13,14 @@
#define MIN(a, b) ((a) < (b) ? (a) : (b)) #define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b)) #define MAX(a, b) ((a) > (b) ? (a) : (b))
template<typename T> template <typename T>
static std::string to_string(const T & val) { static std::string to_string(const T & val) {
std::stringstream ss; std::stringstream ss;
ss << val; ss << val;
return ss.str(); return ss.str();
} }
bool gguf_ex_write(const std::string & fname) { static bool gguf_ex_write(const std::string & fname) {
struct gguf_context * ctx = gguf_init_empty(); struct gguf_context * ctx = gguf_init_empty();
gguf_set_val_u8 (ctx, "some.parameter.uint8", 0x12); gguf_set_val_u8 (ctx, "some.parameter.uint8", 0x12);
@ -85,7 +85,7 @@ bool gguf_ex_write(const std::string & fname) {
} }
// just read tensor info // just read tensor info
bool gguf_ex_read_0(const std::string & fname) { static bool gguf_ex_read_0(const std::string & fname) {
struct gguf_init_params params = { struct gguf_init_params params = {
/*.no_alloc = */ false, /*.no_alloc = */ false,
/*.ctx = */ NULL, /*.ctx = */ NULL,
@ -143,7 +143,7 @@ bool gguf_ex_read_0(const std::string & fname) {
} }
// read and create ggml_context containing the tensors and their data // read and create ggml_context containing the tensors and their data
bool gguf_ex_read_1(const std::string & fname) { static bool gguf_ex_read_1(const std::string & fname) {
struct ggml_context * ctx_data = NULL; struct ggml_context * ctx_data = NULL;
struct gguf_init_params params = { struct gguf_init_params params = {

View file

@ -74,14 +74,6 @@ static T stdev(const std::vector<T> & v) {
return stdev; return stdev;
} }
static bool ggml_cpu_has_metal() {
#if defined(GGML_USE_METAL)
return true;
#else
return false;
#endif
}
static std::string get_cpu_info() { static std::string get_cpu_info() {
std::string id; std::string id;
#ifdef __linux__ #ifdef __linux__

51
examples/main-cmake-pkg/.gitignore vendored Normal file
View file

@ -0,0 +1,51 @@
# Prerequisites
*.d
# Compiled Object files
*.slo
*.lo
*.o
*.obj
# Precompiled Headers
*.gch
*.pch
# Compiled Dynamic libraries
*.so
*.dylib
*.dll
# Fortran module files
*.mod
*.smod
# Compiled Static libraries
*.lai
*.la
*.a
*.lib
# Executables
*.exe
*.out
*.app
*.gguf
*.log
.DS_Store
.build/
.cache/
.direnv/
.envrc
.swiftpm
.venv
.clang-tidy
.vs/
.vscode/
build*/
out/
tmp/

View file

@ -0,0 +1,36 @@
cmake_minimum_required(VERSION 3.12)
project("main-cmake-pkg" C CXX)
set(TARGET main-cmake-pkg)
find_package(Llama 0.0.1 REQUIRED)
# Bake common functionality in with target. Because applications
# using the relocatable Llama package should be outside of the
# source tree, main-cmake-pkg pretends the dependencies are built-in.
set(_common_path "${CMAKE_CURRENT_LIST_DIR}/../../common")
add_library(common OBJECT
${_common_path}/common.h
${_common_path}/common.cpp
${_common_path}/console.h
${_common_path}/console.cpp
${_common_path}/grammar-parser.h
${_common_path}/grammar-parser.cpp
)
# WARNING: because build-info.h is auto-generated, it will only
# be available after the user has built the llama.cpp sources.
#
configure_file(${_common_path}/../build-info.h
${CMAKE_CURRENT_BINARY_DIR}/build-info.h
COPYONLY)
target_include_directories(common PUBLIC ${LLAMA_INCLUDE_DIR}
${CMAKE_CURRENT_BINARY_DIR})
add_executable(${TARGET} ${CMAKE_CURRENT_LIST_DIR}/../main/main.cpp)
target_include_directories(${TARGET} PRIVATE ${_common_path})
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View file

@ -0,0 +1,37 @@
# llama.cpp/example/main-cmake-pkg
This program builds the [main](../main) application using a relocatable CMake package. It serves as an example of using the `find_package()` CMake command to conveniently include [llama.cpp](https://github.com/ggerganov/llama.cpp) in projects which live outside of the source tree.
## Building
Because this example is "outside of the source tree", it is important to first build/install llama.cpp using CMake. An example is provided here, but please see the [llama.cpp build instructions](../..) for more detailed build instructions.
### Considerations
When hardware acceleration libraries are used (e.g. cuBLAS, Metal, CLBlast, etc.), CMake must be able to locate the associated CMake package. In the example below, when building _main-cmake-pkg_ notice the `CMAKE_PREFIX_PATH` includes the Llama CMake package location _in addition to_ the CLBlast package, which was used when compiling _llama.cpp_.
### Build llama.cpp and install to C:\LlamaCPP directory
In this case, CLBlast was already installed so the CMake package is referenced in `CMAKE_PREFIX_PATH`.
```cmd
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
mkdir build
cd build
cmake .. -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=C:/CLBlast/lib/cmake/CLBlast -G "Visual Studio 17 2022" -A x64
cmake --build . --config Release
cmake --install . --prefix C:/LlamaCPP
```
### Build main-cmake-pkg
```cmd
cd ..\examples\main-cmake-pkg
mkdir build
cd build
cmake .. -DBUILD_SHARED_LIBS=OFF -DCMAKE_PREFIX_PATH="C:/CLBlast/lib/cmake/CLBlast;C:/LlamaCPP/lib/cmake/Llama" -G "Visual Studio 17 2022" -A x64
cmake --build . --config Release
cmake --install . --prefix C:/MyLlamaApp
```

View file

@ -144,7 +144,7 @@ The `--ctx-size` option allows you to set the size of the prompt context used by
Some fine-tuned models have extended the context length by scaling RoPE. For example, if the original pretrained model has a context length (max sequence length) of 4096 (4k) and the fine-tuned model has 32k, that is a scaling factor of 8, and it should work by setting the above `--ctx-size` to 32768 (32k) and `--rope-scale` to 8. Some fine-tuned models have extended the context length by scaling RoPE. For example, if the original pretrained model has a context length (max sequence length) of 4096 (4k) and the fine-tuned model has 32k, that is a scaling factor of 8, and it should work by setting the above `--ctx-size` to 32768 (32k) and `--rope-scale` to 8.
- `--rope-scale N`: Where N is the linear scaling factor used by the fine-tuned model. - `--rope-scale N`: Where N is the linear scaling factor used by the fine-tuned model.
### Keep Prompt ### Keep Prompt
@ -274,7 +274,7 @@ These options help improve the performance and memory usage of the LLaMA models.
### NUMA support ### NUMA support
- `--numa`: Attempt optimizations that help on some systems with non-uniform memory access. This currently consists of pinning an equal proportion of the threads to the cores on each NUMA node, and disabling prefetch and readahead for mmap. The latter causes mapped pages to be faulted in on first access instead of all at once, and in combination with pinning threads to NUMA nodes, more of the pages end up on the NUMA node where they are used. Note that if the model is already in the system page cache, for example because of a previous run without this option, this will have little effect unless you drop the page cache first. This can be done by rebooting the system or on Linux by writing '3' to '/proc/sys/vm/drop\_caches' as root. - `--numa`: Attempt optimizations that help on some systems with non-uniform memory access. This currently consists of pinning an equal proportion of the threads to the cores on each NUMA node, and disabling prefetch and readahead for mmap. The latter causes mapped pages to be faulted in on first access instead of all at once, and in combination with pinning threads to NUMA nodes, more of the pages end up on the NUMA node where they are used. Note that if the model is already in the system page cache, for example because of a previous run without this option, this will have little effect unless you drop the page cache first. This can be done by rebooting the system or on Linux by writing '3' to '/proc/sys/vm/drop_caches' as root.
### Memory Float 32 ### Memory Float 32
@ -302,7 +302,6 @@ These options provide extra functionality and customization when running the LLa
- `-h, --help`: Display a help message showing all available options and their default values. This is particularly useful for checking the latest options and default values, as they can change frequently, and the information in this document may become outdated. - `-h, --help`: Display a help message showing all available options and their default values. This is particularly useful for checking the latest options and default values, as they can change frequently, and the information in this document may become outdated.
- `--verbose-prompt`: Print the prompt before generating text. - `--verbose-prompt`: Print the prompt before generating text.
- `--mtest`: Test the model's functionality by running a series of tests to ensure it's working properly.
- `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance. - `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS. - `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS. - `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
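As an illustration of how such a split list maps to per-GPU proportions, here is a hedged, self-contained sketch; it is not the actual llama.cpp argument parser, just the arithmetic the option describes:

```cpp
#include <cstdio>
#include <sstream>
#include <string>
#include <vector>

// Parse "3,2" into {3,2} and normalize to fractions {0.6, 0.4}.
static std::vector<float> parse_tensor_split(const std::string & arg) {
    std::vector<float> split;
    std::stringstream ss(arg);
    std::string item;
    float total = 0.0f;
    while (std::getline(ss, item, ',')) {
        split.push_back(std::stof(item));
        total += split.back();
    }
    for (float & s : split) s /= total;  // proportion of data assigned to each GPU
    return split;
}

int main() {
    for (float s : parse_tensor_split("3,2")) printf("%.2f ", s);  // prints: 0.60 0.40
}
```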


@ -41,7 +41,8 @@ static std::ostringstream * g_output_ss;
static std::vector<llama_token> * g_output_tokens;
static bool is_interacting = false;

void write_logfile(
static void write_logfile(
    const llama_context * ctx, const gpt_params & params, const llama_model * model,
    const std::vector<llama_token> & input_tokens, const std::string & output,
    const std::vector<llama_token> & output_tokens
@ -86,7 +87,7 @@ void write_logfile(
}

#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
void sigint_handler(int signo) {
static void sigint_handler(int signo) {
    if (signo == SIGINT) {
        if (!is_interacting) {
            is_interacting = true;
@ -148,6 +149,7 @@ int main(int argc, char ** argv) {
    }

    LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
    LOG_TEE("%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET);

    if (params.seed == LLAMA_DEFAULT_SEED) {
        params.seed = time(NULL);
// determine the maximum memory usage needed to do inference for the given n_batch and n_ctx parameters
// uncomment the "used_mem" line in llama.cpp to see the results
if (params.mem_test) {
{
LOG_TEE("%s: testing memory usage for n_batch = %d, n_ctx = %d\n", __func__, params.n_batch, params.n_ctx);
const std::vector<llama_token> tmp(params.n_batch, llama_token_bos(ctx));
llama_eval(ctx, tmp.data(), tmp.size(), params.n_ctx, params.n_threads);
}
llama_print_timings(ctx);
llama_free(ctx);
llama_free_model(model);
return 0;
}
    // export the cgraph and exit
    if (params.export_cgraph) {
        llama_eval_export(ctx, "llama.ggml");


@ -1,6 +1,5 @@
#include "common.h" #include "common.h"
#include "llama.h" #include "llama.h"
#include "build-info.h"
#include <cmath> #include <cmath>
#include <cstdio> #include <cstdio>
@ -28,9 +27,10 @@ struct results_log_softmax {
float prob; float prob;
}; };
void write_logfile(const llama_context * ctx, const gpt_params & params, static void write_logfile(
const llama_model * model, const struct results_perplexity & results) { const llama_context * ctx, const gpt_params & params, const llama_model * model,
const struct results_perplexity & results
) {
if (params.logdir.empty()) { if (params.logdir.empty()) {
return; return;
} }
@ -76,7 +76,7 @@ void write_logfile(const llama_context * ctx, const gpt_params & params,
fclose(logfile); fclose(logfile);
} }
std::vector<float> softmax(const std::vector<float>& logits) { static std::vector<float> softmax(const std::vector<float>& logits) {
std::vector<float> probs(logits.size()); std::vector<float> probs(logits.size());
float max_logit = logits[0]; float max_logit = logits[0];
for (float v : logits) max_logit = std::max(max_logit, v); for (float v : logits) max_logit = std::max(max_logit, v);
@ -92,7 +92,7 @@ std::vector<float> softmax(const std::vector<float>& logits) {
return probs; return probs;
} }
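The function above follows the standard numerically stable softmax: the running maximum is subtracted before exponentiating so `exp()` never overflows. The diff elides the middle of the function, so here is a self-contained sketch of the same idea, under the assumption that the elided part normalizes by the sum of exponentials:

```cpp
#include <algorithm>
#include <cmath>
#include <vector>

// Numerically stable softmax: shifting by max_logit leaves the result
// unchanged mathematically but keeps every exponent <= 0.
static std::vector<float> softmax_sketch(const std::vector<float> & logits) {
    std::vector<float> probs(logits.size());
    float max_logit = logits[0];
    for (float v : logits) max_logit = std::max(max_logit, v);
    double sum_exp = 0.0;
    for (size_t i = 0; i < logits.size(); i++) {
        probs[i] = expf(logits[i] - max_logit);
        sum_exp += probs[i];
    }
    for (float & p : probs) p = (float)(p / sum_exp);
    return probs;
}
```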
results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) {
static results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) {
    float max_logit = logits[0];
    for (int i = 1; i < n_vocab; ++i) max_logit = std::max(max_logit, logits[i]);
    double sum_exp = 0.0;
@ -100,9 +100,10 @@ results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) {
    return {logits[tok] - max_logit - log(sum_exp), logits[tok], expf(logits[tok] - max_logit) / (float) sum_exp};
}

void process_logits(int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
                    double & nll, double & nll2, float * logit_history, float * prob_history) {
static void process_logits(
    int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
    double & nll, double & nll2, float * logit_history, float * prob_history
) {
    std::mutex mutex;
    int counter = 0;
    auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () {
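The mutex/counter pair implements simple dynamic work sharing: each worker locks the mutex, claims the next token index, and accumulates private partial sums that are merged at the end. A hedged, self-contained sketch of the same scheme follows; the name `parallel_sum` and the per-item work are illustrative, not from the diff:

```cpp
#include <mutex>
#include <thread>
#include <vector>

// Dynamic work sharing: threads repeatedly claim the next index under a
// mutex, so faster threads naturally process more items.
static double parallel_sum(int n_items, int n_threads) {
    std::mutex mutex;
    int counter = 0;
    double total = 0.0;
    auto worker = [&]() {
        double local = 0.0;
        while (true) {
            int i;
            {
                std::lock_guard<std::mutex> lock(mutex);
                if (counter >= n_items) break;  // lock released when the scope exits
                i = counter++;
            }
            local += i * 0.5;  // stand-in for the per-token log-prob work
        }
        std::lock_guard<std::mutex> lock(mutex);
        total += local;  // merge the thread-local partial sum
    };
    std::vector<std::thread> workers;
    for (int t = 0; t < n_threads; ++t) workers.emplace_back(worker);
    for (auto & w : workers) w.join();
    return total;
}
```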
@ -130,7 +131,7 @@ void process_logits(int n_vocab, const float * logits, const int * tokens, int n
}

results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) {
static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) {
    // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
    // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
    // Output: `perplexity: 13.5106 [114/114]`
@ -260,8 +261,7 @@ results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params)
    return {tokens, std::exp(nll / count), logit_history, prob_history};
}

results_perplexity perplexity(llama_context * ctx, const gpt_params & params) {
static results_perplexity perplexity(llama_context * ctx, const gpt_params & params) {
    if (params.ppl_stride > 0) {
        return perplexity_v2(ctx, params);
    }
@ -400,8 +400,9 @@ results_perplexity perplexity(llama_context * ctx, const gpt_params & params) {
    return {tokens, ppl, logit_history, prob_history};
}

std::vector<float> hellaswag_evaluate_tokens(llama_context * ctx, const std::vector<int>& tokens, int n_past, int n_batch,
                                             int n_vocab, int n_thread) {
static std::vector<float> hellaswag_evaluate_tokens(
    llama_context * ctx, const std::vector<int>& tokens, int n_past, int n_batch, int n_vocab, int n_thread
) {
    std::vector<float> result;
    result.reserve(tokens.size() * n_vocab);
    size_t n_chunk = (tokens.size() + n_batch - 1)/n_batch;
@ -421,7 +422,7 @@ std::vector<float> hellaswag_evaluate_tokens(llama_context * ctx, const std::vec
    return result;
}
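`(tokens.size() + n_batch - 1)/n_batch` is integer ceiling division, so the final partial batch is still evaluated. A hedged sketch of the chunking loop follows, using the `llama_eval(ctx, ptr, n, n_past, n_threads)` signature that appears elsewhere in this diff; it assumes the context returns logits for every position, and error handling is omitted:

```cpp
#include <algorithm>
#include <vector>

#include "llama.h"

// Evaluate `tokens` in chunks of at most n_batch, advancing n_past so the
// model sees the chunks as one contiguous sequence.
static std::vector<float> eval_in_chunks(llama_context * ctx, const std::vector<int> & tokens,
                                         int n_past, int n_batch, int n_vocab, int n_thread) {
    std::vector<float> result;
    result.reserve(tokens.size() * (size_t) n_vocab);
    const size_t n_chunk = (tokens.size() + n_batch - 1) / n_batch;  // ceiling division
    for (size_t i = 0; i < n_chunk; ++i) {
        const size_t begin = i * (size_t) n_batch;
        const size_t n_cur = std::min((size_t) n_batch, tokens.size() - begin);
        llama_eval(ctx, tokens.data() + begin, (int) n_cur, n_past, n_thread);
        n_past += (int) n_cur;
        const float * logits = llama_get_logits(ctx);  // assumes logits for all positions
        result.insert(result.end(), logits, logits + n_cur * (size_t) n_vocab);
    }
    return result;
}
```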
void hellaswag_score(llama_context * ctx, const gpt_params & params) {
static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
    // Calculates hellaswag score (acc_norm) from prompt
    //
    // Data extracted from the HellaSwag validation dataset (MIT license) https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl
@ -668,7 +669,7 @@ int main(int argc, char ** argv) {
        params.n_ctx += params.ppl_stride/2;
    }

    fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
    print_build_info();

    if (params.seed == LLAMA_DEFAULT_SEED) {
        params.seed = time(NULL);
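Several tools in this diff replace their hand-rolled `fprintf` of `BUILD_NUMBER`/`BUILD_COMMIT` with a shared `print_build_info()` from `common`. Its body is not shown anywhere in this diff, so the following is a hypothetical reconstruction, consistent with the lines being removed and with the extra compiler/target line that `main` now logs:

```cpp
#include <cstdio>

#include "build-info.h"

// Hypothetical sketch: prints the same fields the per-tool fprintf calls
// used to print. The real helper lives in common/.
void print_build_info(void) {
    fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
    fprintf(stderr, "%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET);
}
```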


@ -2,4 +2,5 @@ set(TARGET quantize-stats)
add_executable(${TARGET} quantize-stats.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
target_include_directories(${TARGET} PRIVATE ../../common)
target_compile_features(${TARGET} PRIVATE cxx_std_11)


@ -1,7 +1,6 @@
#include "ggml.h"
#include "build-info.h"
#define LLAMA_API_INTERNAL #define LLAMA_API_INTERNAL
#include "common.h"
#include "ggml.h"
#include "llama.h" #include "llama.h"
#include <algorithm> #include <algorithm>
@ -34,8 +33,8 @@ struct quantize_stats_params {
std::vector<enum ggml_type> include_types; std::vector<enum ggml_type> include_types;
}; };
const size_t HISTOGRAM_BUCKETS = 150; constexpr size_t HISTOGRAM_BUCKETS = 150;
const double HISTOGRAM_RANGE = 0.03; constexpr double HISTOGRAM_RANGE = 0.03;
struct error_stats { struct error_stats {
size_t num_samples; size_t num_samples;
@ -44,8 +43,7 @@ struct error_stats {
uint64_t error_histogram[HISTOGRAM_BUCKETS]; uint64_t error_histogram[HISTOGRAM_BUCKETS];
}; };
static void quantize_stats_print_usage(int /*argc*/, char ** argv) {
void quantize_stats_print_usage(int /*argc*/, char ** argv) {
quantize_stats_params params; quantize_stats_params params;
fprintf(stderr, "usage: %s [options]\n", argv[0]); fprintf(stderr, "usage: %s [options]\n", argv[0]);
fprintf(stderr, "\n"); fprintf(stderr, "\n");
@ -71,7 +69,7 @@ void quantize_stats_print_usage(int /*argc*/, char ** argv) {
} }
// Check if a layer is included/excluded by command line // Check if a layer is included/excluded by command line
bool layer_included(const quantize_stats_params & params, const std::string & layer) { static bool layer_included(const quantize_stats_params & params, const std::string & layer) {
for (const auto& excluded : params.exclude_layers) { for (const auto& excluded : params.exclude_layers) {
if (std::regex_search(layer, std::regex(excluded))) { if (std::regex_search(layer, std::regex(excluded))) {
return false; return false;
@ -86,7 +84,7 @@ bool layer_included(const quantize_stats_params & params, const std::string & la
} }
// Update error statistics given vectors with the before/after result of quantization // Update error statistics given vectors with the before/after result of quantization
void update_error_stats(int64_t nelements, const float * input, const float * output, error_stats & stats) { static void update_error_stats(int64_t nelements, const float * input, const float * output, error_stats & stats) {
for (int64_t i = 0; i < nelements; i++) { for (int64_t i = 0; i < nelements; i++) {
double diff = input[i] - output[i]; double diff = input[i] - output[i];
stats.total_error += diff * diff; stats.total_error += diff * diff;
@ -96,14 +94,14 @@ void update_error_stats(int64_t nelements, const float * input, const float * ou
stats.num_samples += nelements; stats.num_samples += nelements;
} }
void combine_error_stats(error_stats & into, const error_stats & from) { static void combine_error_stats(error_stats & into, const error_stats & from) {
into.num_samples += from.num_samples; into.num_samples += from.num_samples;
into.total_error += from.total_error; into.total_error += from.total_error;
if (from.max_error > into.max_error) into.max_error = from.max_error; if (from.max_error > into.max_error) into.max_error = from.max_error;
for (size_t i=0; i<HISTOGRAM_BUCKETS; ++i) into.error_histogram[i] += from.error_histogram[i]; for (size_t i=0; i<HISTOGRAM_BUCKETS; ++i) into.error_histogram[i] += from.error_histogram[i];
} }
double find_quantile(const error_stats & stats, double quantile) { static double find_quantile(const error_stats & stats, double quantile) {
double sum = std::accumulate(std::begin(stats.error_histogram), std::end(stats.error_histogram), 0.0); double sum = std::accumulate(std::begin(stats.error_histogram), std::end(stats.error_histogram), 0.0);
double accum = 0; double accum = 0;
@ -116,7 +114,7 @@ double find_quantile(const error_stats & stats, double quantile) {
return INFINITY; return INFINITY;
} }
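The quantile lookup walks the histogram until the cumulative count crosses the requested fraction of all samples. The loop body is elided in this diff, so the sketch below is hedged: the linear mapping from bucket index back to an error value in `[0, HISTOGRAM_RANGE)` is my assumption, not shown in the hunk:

```cpp
#include <cmath>
#include <cstdint>
#include <numeric>

constexpr size_t HISTOGRAM_BUCKETS = 150;
constexpr double HISTOGRAM_RANGE   = 0.03;

// Walk the histogram until the running count reaches quantile*total,
// then map the bucket index back to an error value.
static double find_quantile_sketch(const uint64_t (&hist)[HISTOGRAM_BUCKETS], double quantile) {
    const double sum = std::accumulate(std::begin(hist), std::end(hist), 0.0);
    double accum = 0;
    for (size_t i = 0; i < HISTOGRAM_BUCKETS; ++i) {
        accum += hist[i];
        if (accum >= sum * quantile) {
            return (double) i * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS;  // assumed linear mapping
        }
    }
    return INFINITY;  // quantile falls beyond the recorded range
}
```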
void print_error_stats(const std::string & name, const error_stats & stats, bool print_histogram) {
static void print_error_stats(const std::string & name, const error_stats & stats, bool print_histogram) {
    double rmse = sqrt(stats.total_error / (double) stats.num_samples);
    double median = find_quantile(stats, .5);
    double pct95 = find_quantile(stats, .95);
@ -143,17 +141,10 @@ static bool tensor_is_contiguous(const struct ggml_tensor * tensor) {
        tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
}

void test_roundtrip_on_chunk(
    const ggml_tensor * layer,
    int64_t offset,
    int64_t chunk_size,
    const ggml_type_traits_t & qfns,
    bool use_reference,
    float * input_scratch,
    char * quantized_scratch,
    float * output_scratch,
    error_stats & stats) {
static void test_roundtrip_on_chunk(
    const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits_t & qfns, bool use_reference,
    float * input_scratch, char * quantized_scratch, float * output_scratch, error_stats & stats
) {
    if (layer->type == GGML_TYPE_F16) {
        for (int i = 0; i < chunk_size; i++) {
            input_scratch[i] = ggml_get_f32_1d(layer, i + offset);
@ -174,18 +165,11 @@ void test_roundtrip_on_chunk(

// Run quantization function for a single layer and update error stats
void test_roundtrip_on_layer(
    std::string & name,
    bool print_layer_stats,
    const ggml_type_traits_t & qfns,
    bool use_reference,
    const ggml_tensor * layer,
    std::vector<float> & input_scratch,
    std::vector<char> & quantized_scratch,
    std::vector<float> & output_scratch,
    error_stats & total_error,
    int max_thread = 0) {
static void test_roundtrip_on_layer(
    std::string & name, bool print_layer_stats, const ggml_type_traits_t & qfns, bool use_reference,
    const ggml_tensor * layer, std::vector<float> & input_scratch, std::vector<char> & quantized_scratch,
    std::vector<float> & output_scratch, error_stats & total_error, int max_thread = 0
) {
    assert(tensor_is_contiguous(layer));
    error_stats layer_error {};
    uint64_t nelements = ggml_nelements(layer);
@ -314,7 +298,7 @@ int main(int argc, char ** argv) {
        return 1;
    }

    fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
    print_build_info();

    // load the model
    fprintf(stderr, "Loading model\n");


@ -2,6 +2,7 @@ set(TARGET quantize)
add_executable(${TARGET} quantize.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
target_include_directories(${TARGET} PRIVATE ../../common)
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
    add_dependencies(${TARGET} BUILD_INFO)


@ -1,5 +1,4 @@
#include "build-info.h" #include "common.h"
#include "llama.h" #include "llama.h"
#include <cstdio> #include <cstdio>
@ -40,7 +39,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
}; };
bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) { static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
std::string ftype_str; std::string ftype_str;
for (auto ch : ftype_str_in) { for (auto ch : ftype_str_in) {
@ -72,7 +71,7 @@ bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std:
// usage: // usage:
// ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads] // ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
// //
void usage(const char * executable) { static void usage(const char * executable) {
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable); printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n"); printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
@ -161,7 +160,7 @@ int main(int argc, char ** argv) {
} }
} }
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); print_build_info();
fprintf(stderr, "%s: quantizing '%s' to '%s' as %s", __func__, fname_inp.c_str(), fname_out.c_str(), ftype_str.c_str()); fprintf(stderr, "%s: quantizing '%s' to '%s' as %s", __func__, fname_inp.c_str(), fname_out.c_str(), ftype_str.c_str());
if (params.nthread > 0) { if (params.nthread > 0) {


@ -1,6 +1,5 @@
#include "common.h" #include "common.h"
#include "llama.h" #include "llama.h"
#include "build-info.h"
#include <vector> #include <vector>
#include <cstdio> #include <cstdio>
@ -17,7 +16,7 @@ int main(int argc, char ** argv) {
return 1; return 1;
} }
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); print_build_info();
if (params.n_predict < 0) { if (params.n_predict < 0) {
params.n_predict = 16; params.n_predict = 16;


@ -1099,8 +1099,9 @@ static json format_final_response(llama_server_context &llama, const std::string
    return res;
}

static json format_partial_response(llama_server_context &llama, const std::string &content, const std::vector<completion_token_output> &probs)
{
static json format_partial_response(
    llama_server_context &llama, const std::string &content, const std::vector<completion_token_output> &probs
) {
    json res = json{
        {"content", content},
        {"stop", false},
@ -1231,7 +1232,7 @@ static void log_server_request(const Request &req, const Response &res)
    });
}

bool is_at_eob(llama_server_context & server_context, const llama_token * tokens, const size_t n_tokens) {
static bool is_at_eob(llama_server_context &server_context, const llama_token *tokens, const size_t n_tokens) {
    return n_tokens && tokens[n_tokens-1] == llama_token_eos(server_context.ctx);
}

@ -1241,7 +1242,7 @@ bool is_at_eob(llama_server_context & server_context, const llama_token * tokens
// * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
//   This is also called when the stop condition is met.
//   Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
void beam_search_callback(void * callback_data, llama_beams_state beams_state) {
static void beam_search_callback(void *callback_data, llama_beams_state beams_state) {
    auto & llama = *static_cast<llama_server_context*>(callback_data);
    // Mark beams as EOS as needed.
    for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
@ -1274,7 +1275,8 @@ struct token_translator {
    std::string operator()(const completion_token_output & cto) const { return (*this)(cto.tok); }
};

void append_to_generated_text_from_generated_token_probs(llama_server_context & llama) {
static void append_to_generated_text_from_generated_token_probs(llama_server_context &llama)
{
    auto & gtps = llama.generated_token_probs;
    auto translator = token_translator{llama.ctx};
    auto add_strlen = [=](size_t sum, const completion_token_output & cto) { return sum + translator(cto).size(); };


@ -3,6 +3,3 @@ add_executable(${TARGET} simple.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()


@ -1,5 +1,3 @@
#include "build-info.h"
#include "common.h" #include "common.h"
#include "llama.h" #include "llama.h"


@ -42,6 +42,7 @@ int main(int argc, char ** argv) {
    // load the draft model
    params.model = params.model_draft;
    params.n_gpu_layers = params.n_gpu_layers_draft;
    std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params);

    // tokenize the prompt
@ -81,7 +82,7 @@ int main(int argc, char ** argv) {
    //GGML_ASSERT(n_vocab == llama_n_vocab(ctx_dft));

    // how many tokens to draft each time
    const int n_draft = params.n_draft;
    int n_draft = params.n_draft;

    int n_predict = 0;
    int n_drafted = 0;
@ -130,6 +131,7 @@ int main(int argc, char ** argv) {
    LOG("drafted: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_dft, drafted));

    int i_dft = 0;

    while (true) {
        // sample from the target model
        const llama_token id = llama_sample_token(ctx_tgt, NULL, grammar_tgt, params, last_tokens, candidates, i_dft);
@ -173,6 +175,27 @@ int main(int argc, char ** argv) {
        llama_eval(ctx_dft, &id, 1, n_past_dft, params.n_threads);
        ++n_past_dft;

        // heuristic for n_draft
        {
            const int n_draft_cur = (int) drafted.size();
            const bool all_accepted = i_dft == n_draft_cur;

            LOG("n_draft = %d\n", n_draft);
            LOG("n_draft_cur = %d\n", n_draft_cur);
            LOG("i_dft = %d\n", i_dft);
            LOG("all_accepted = %d\n", all_accepted);

            if (all_accepted && n_draft == n_draft_cur) {
                LOG(" - max drafted tokens accepted - n_draft += 8\n");
                n_draft = std::min(30, n_draft + 8);
            } else if (all_accepted) {
                LOG(" - partially drafted tokens accepted - no change\n");
            } else {
                LOG(" - drafted token rejected - n_draft -= 1\n");
                n_draft = std::max(2, n_draft - 1);
            }
        }

        drafted.clear();
        drafted.push_back(id);


@ -34,7 +34,7 @@
        with pkgs; [ openblas ]
      );
      pkgs = import nixpkgs { inherit system; };
      nativeBuildInputs = with pkgs; [ cmake ninja pkgconfig ];
      nativeBuildInputs = with pkgs; [ cmake ninja pkg-config ];
      llama-python =
        pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece ]);
      postPatch = ''
@ -45,6 +45,8 @@
      postInstall = ''
        mv $out/bin/main $out/bin/llama
        mv $out/bin/server $out/bin/llama-server
        mkdir -p $out/include
        cp ${src}/llama.h $out/include/
      '';
      cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" "-DLLAMA_MPI=ON" "-DBUILD_SHARED_LIBS=ON" "-DCMAKE_SKIP_BUILD_RPATH=ON" ];
    in


@ -131,6 +131,10 @@ static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_ten
    return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size;
}

static bool ggml_is_view(struct ggml_tensor * t) {
    return t->view_src != NULL;
}

void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
#ifdef GGML_ALLOCATOR_DEBUG
    GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
@ -340,8 +344,8 @@ static void free_vmem(void * base_addr, size_t size) {

// allocate uncommitted virtual memory to measure the size of the graph
static void alloc_measure_vmem(void ** base_addr, size_t * size) {
    // 1TB for 64-bit, 1GB for 32-bit
    *size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<40;
    // 128GB for 64-bit, 1GB for 32-bit
    *size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<37;
    do {
        *base_addr = alloc_vmem(*size);
        if (*base_addr != NULL) {
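The `do { ... }` loop retries the reservation with smaller sizes until the OS grants one; the memory is only used to measure tensor offsets, never committed. The diff cuts off after the loop head, so the sketch below is hedged: the geometric back-off step and the lower bound are my assumptions, and `alloc_vmem` is stubbed out here while in the real file it wraps mmap/VirtualAlloc:

```cpp
#include <cstddef>
#include <cstdlib>

// Stand-in for the platform wrapper (mmap with PROT_NONE / VirtualAlloc
// with MEM_RESERVE in the real file); may return NULL on failure.
static void * alloc_vmem(size_t size) { return malloc(size); }

static void alloc_measure_vmem_sketch(void ** base_addr, size_t * size) {
    *size = sizeof(void *) == 4 ? 1ULL << 30 : 1ULL << 37; // 1GB / 128GB
    do {
        *base_addr = alloc_vmem(*size);
        if (*base_addr != NULL) {
            return; // reservation succeeded
        }
        *size /= 2; // assumption: retry with half the size until it fits
    } while (*size >= 1ULL << 20);
    *base_addr = NULL;
    *size = 0;
}
```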
@ -401,10 +405,6 @@ bool ggml_allocr_is_measure(struct ggml_allocr * alloc) {
//////////// compute graph allocator

static bool ggml_is_view(struct ggml_tensor * t) {
    return t->view_src != NULL;
}

static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
    if (a->type != b->type) {
        return false;

File diff suppressed because it is too large.


@ -63,7 +63,10 @@ struct ggml_metal_context {
    GGML_METAL_DECL_KERNEL(relu);
    GGML_METAL_DECL_KERNEL(gelu);
    GGML_METAL_DECL_KERNEL(soft_max);
    GGML_METAL_DECL_KERNEL(soft_max_4);
    GGML_METAL_DECL_KERNEL(diag_mask_inf);
    GGML_METAL_DECL_KERNEL(diag_mask_inf_8);
    GGML_METAL_DECL_KERNEL(get_rows_f32);
    GGML_METAL_DECL_KERNEL(get_rows_f16);
    GGML_METAL_DECL_KERNEL(get_rows_q4_0);
    GGML_METAL_DECL_KERNEL(get_rows_q4_1);
@ -75,8 +78,10 @@ struct ggml_metal_context {
    GGML_METAL_DECL_KERNEL(get_rows_q6_K);
    GGML_METAL_DECL_KERNEL(rms_norm);
    GGML_METAL_DECL_KERNEL(norm);
    GGML_METAL_DECL_KERNEL(mul_mat_f32_f32);
    GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
    GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_1row);
    GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_l4);
    GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
    GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32);
    GGML_METAL_DECL_KERNEL(mul_mat_q8_0_f32);
@ -85,6 +90,7 @@ struct ggml_metal_context {
    GGML_METAL_DECL_KERNEL(mul_mat_q4_K_f32);
    GGML_METAL_DECL_KERNEL(mul_mat_q5_K_f32);
    GGML_METAL_DECL_KERNEL(mul_mat_q6_K_f32);
    GGML_METAL_DECL_KERNEL(mul_mm_f32_f32);
    GGML_METAL_DECL_KERNEL(mul_mm_f16_f32);
    GGML_METAL_DECL_KERNEL(mul_mm_q4_0_f32);
    GGML_METAL_DECL_KERNEL(mul_mm_q4_1_f32);
@ -142,7 +148,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
    ctx->n_buffers = 0;
    ctx->concur_list_len = 0;

    ctx->d_queue = dispatch_queue_create("llama.cpp", DISPATCH_QUEUE_CONCURRENT);
    ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);

#ifdef GGML_SWIFT
    // load the default.metallib file
@ -172,7 +178,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
        //NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/metal/metal" ofType:@"metal"];
        NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
        NSString * path = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];

        metal_printf("%s: loading '%s'\n", __func__, [path UTF8String]);

        NSString * src = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error];
@ -218,7 +224,10 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
        GGML_METAL_ADD_KERNEL(relu);
        GGML_METAL_ADD_KERNEL(gelu);
        GGML_METAL_ADD_KERNEL(soft_max);
        GGML_METAL_ADD_KERNEL(soft_max_4);
        GGML_METAL_ADD_KERNEL(diag_mask_inf);
        GGML_METAL_ADD_KERNEL(diag_mask_inf_8);
        GGML_METAL_ADD_KERNEL(get_rows_f32);
        GGML_METAL_ADD_KERNEL(get_rows_f16);
        GGML_METAL_ADD_KERNEL(get_rows_q4_0);
        GGML_METAL_ADD_KERNEL(get_rows_q4_1);
@ -230,8 +239,10 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
        GGML_METAL_ADD_KERNEL(get_rows_q6_K);
        GGML_METAL_ADD_KERNEL(rms_norm);
        GGML_METAL_ADD_KERNEL(norm);
        GGML_METAL_ADD_KERNEL(mul_mat_f32_f32);
        GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
        GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_1row);
        GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_l4);
        GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
        GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32);
        GGML_METAL_ADD_KERNEL(mul_mat_q8_0_f32);
@ -240,6 +251,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
        GGML_METAL_ADD_KERNEL(mul_mat_q4_K_f32);
        GGML_METAL_ADD_KERNEL(mul_mat_q5_K_f32);
        GGML_METAL_ADD_KERNEL(mul_mat_q6_K_f32);
        GGML_METAL_ADD_KERNEL(mul_mm_f32_f32);
        GGML_METAL_ADD_KERNEL(mul_mm_f16_f32);
        GGML_METAL_ADD_KERNEL(mul_mm_q4_0_f32);
        GGML_METAL_ADD_KERNEL(mul_mm_q8_0_f32);
@ -286,7 +298,10 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
    GGML_METAL_DEL_KERNEL(relu);
    GGML_METAL_DEL_KERNEL(gelu);
    GGML_METAL_DEL_KERNEL(soft_max);
    GGML_METAL_DEL_KERNEL(soft_max_4);
    GGML_METAL_DEL_KERNEL(diag_mask_inf);
    GGML_METAL_DEL_KERNEL(diag_mask_inf_8);
    GGML_METAL_DEL_KERNEL(get_rows_f32);
    GGML_METAL_DEL_KERNEL(get_rows_f16);
    GGML_METAL_DEL_KERNEL(get_rows_q4_0);
    GGML_METAL_DEL_KERNEL(get_rows_q4_1);
@ -298,8 +313,10 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
    GGML_METAL_DEL_KERNEL(get_rows_q6_K);
    GGML_METAL_DEL_KERNEL(rms_norm);
    GGML_METAL_DEL_KERNEL(norm);
    GGML_METAL_DEL_KERNEL(mul_mat_f32_f32);
    GGML_METAL_DEL_KERNEL(mul_mat_f16_f32);
    GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_1row);
    GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_l4);
    GGML_METAL_DEL_KERNEL(mul_mat_q4_0_f32);
    GGML_METAL_DEL_KERNEL(mul_mat_q4_1_f32);
    GGML_METAL_DEL_KERNEL(mul_mat_q8_0_f32);
@ -308,6 +325,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
    GGML_METAL_DEL_KERNEL(mul_mat_q4_K_f32);
    GGML_METAL_DEL_KERNEL(mul_mat_q5_K_f32);
    GGML_METAL_DEL_KERNEL(mul_mat_q6_K_f32);
    GGML_METAL_DEL_KERNEL(mul_mm_f32_f32);
    GGML_METAL_DEL_KERNEL(mul_mm_f16_f32);
    GGML_METAL_DEL_KERNEL(mul_mm_q4_0_f32);
    GGML_METAL_DEL_KERNEL(mul_mm_q8_0_f32);
@ -378,6 +396,7 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, stru
    for (int i = 0; i < ctx->n_buffers; ++i) {
        const int64_t ioffs = (int64_t) t->data - (int64_t) ctx->buffers[i].data;

        //metal_printf("ioffs = %10ld, tsize = %10ld, sum = %10ld, ctx->buffers[%d].size = %10ld, name = %s\n", ioffs, tsize, ioffs + tsize, i, ctx->buffers[i].size, ctx->buffers[i].name);

        if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) {
            *offs = (size_t) ioffs;
@ -715,6 +734,7 @@ void ggml_metal_graph_compute(
            case GGML_OP_ADD:
                {
                    GGML_ASSERT(ggml_is_contiguous(src0));
                    GGML_ASSERT(ggml_is_contiguous(src1));

                    // utilize float4
                    GGML_ASSERT(ne00 % 4 == 0);
@ -722,6 +742,7 @@ void ggml_metal_graph_compute(
                    if (ggml_nelements(src1) == ne10) {
                        // src1 is a row
                        GGML_ASSERT(ne11 == 1);
                        [encoder setComputePipelineState:ctx->pipeline_add_row];
                    } else {
                        [encoder setComputePipelineState:ctx->pipeline_add];
@ -738,6 +759,7 @@ void ggml_metal_graph_compute(
            case GGML_OP_MUL:
                {
                    GGML_ASSERT(ggml_is_contiguous(src0));
                    GGML_ASSERT(ggml_is_contiguous(src1));

                    // utilize float4
                    GGML_ASSERT(ne00 % 4 == 0);
@ -745,6 +767,7 @@ void ggml_metal_graph_compute(
                    if (ggml_nelements(src1) == ne10) {
                        // src1 is a row
                        GGML_ASSERT(ne11 == 1);
                        [encoder setComputePipelineState:ctx->pipeline_mul_row];
                    } else {
                        [encoder setComputePipelineState:ctx->pipeline_mul];
@ -760,6 +783,8 @@ void ggml_metal_graph_compute(
                } break;
            case GGML_OP_SCALE:
                {
                    GGML_ASSERT(ggml_is_contiguous(src0));

                    const float scale = *(const float *) src1->data;

                    [encoder setComputePipelineState:ctx->pipeline_scale];
@ -767,7 +792,7 @@ void ggml_metal_graph_compute(
                    [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
                    [encoder setBytes:&scale length:sizeof(scale) atIndex:2];

                    const int64_t n = ggml_nelements(dst);
                    const int64_t n = ggml_nelements(dst)/4;

                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                } break;
@ -779,7 +804,7 @@ void ggml_metal_graph_compute(
                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                    [encoder setBuffer:id_dst offset:offs_dst atIndex:1];

                    const int64_t n = ggml_nelements(dst);
                    const int64_t n = ggml_nelements(dst)/4;

                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                } break;
@ -799,7 +824,7 @@ void ggml_metal_graph_compute(
                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                    [encoder setBuffer:id_dst offset:offs_dst atIndex:1];

                    const int64_t n = ggml_nelements(dst);
                    const int64_t n = ggml_nelements(dst)/4;

                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                } break;
@ -813,13 +838,16 @@ void ggml_metal_graph_compute(
                {
                    const int nth = 32;

                    [encoder setComputePipelineState:ctx->pipeline_soft_max];
                    if (ne00%4 == 0) {
                        [encoder setComputePipelineState:ctx->pipeline_soft_max_4];
                    } else {
                        [encoder setComputePipelineState:ctx->pipeline_soft_max];
                    }
                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                    [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
                    [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
                    [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
                    [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
                    [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];

                    [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
                } break;
@ -827,14 +855,23 @@ void ggml_metal_graph_compute(
                {
                    const int n_past = ((int32_t *)(dst->op_params))[0];

                    [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf];
                    if (ne00%8 == 0) {
                        [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf_8];
                    } else {
                        [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf];
                    }
                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                    [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
                    [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
                    [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
                    [encoder setBytes:&n_past length:sizeof(int) atIndex:4];

                    [encoder dispatchThreadgroups:MTLSizeMake(ne00, ne01, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                    if (ne00%8 == 0) {
                        [encoder dispatchThreadgroups:MTLSizeMake(ne00*ne01*ne02/8, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                    }
                    else {
                        [encoder dispatchThreadgroups:MTLSizeMake(ne00, ne01, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                    }
                } break;
            case GGML_OP_MUL_MAT:
                {
@ -847,13 +884,14 @@ void ggml_metal_graph_compute(

                    // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
                    // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
                    if (ggml_is_contiguous(src0) &&
                        ggml_is_contiguous(src1) &&
                    if (!ggml_is_transposed(src0) &&
                        !ggml_is_transposed(src1) &&
                        src1t == GGML_TYPE_F32 &&
                        [ctx->device supportsFamily:MTLGPUFamilyApple7] &&
                        ne00%32 == 0 &&
                        ne11 > 1) {
                        switch (src0->type) {
                            case GGML_TYPE_F32:  [encoder setComputePipelineState:ctx->pipeline_mul_mm_f32_f32];  break;
                            case GGML_TYPE_F16:  [encoder setComputePipelineState:ctx->pipeline_mul_mm_f16_f32];  break;
                            case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_0_f32]; break;
                            case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_1_f32]; break;
@ -873,25 +911,38 @@ void ggml_metal_graph_compute(
                        [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:5];
                        [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:6];
                        [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:7];
                        [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:8];
                        [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:9];
                        [encoder setBytes:&gqa length:sizeof(gqa) atIndex:10];
                        [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:8];
                        [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:9];
                        [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:10];
                        [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:11];
                        [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:12];
                        [encoder setBytes:&gqa length:sizeof(gqa) atIndex:13];
                        [encoder setThreadgroupMemoryLength:8192 atIndex:0];
                        [encoder dispatchThreadgroups:MTLSizeMake( (ne11+31)/32, (ne01+63) / 64, ne12) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
                    } else {
                        int nth0 = 32;
                        int nth1 = 1;
                        int nrows = 1;

                        // use custom matrix x vector kernel
                        switch (src0t) {
                            case GGML_TYPE_F32:
                                {
                                    [encoder setComputePipelineState:ctx->pipeline_mul_mat_f32_f32];
                                    nrows = 4;
                                } break;
                            case GGML_TYPE_F16:
                                {
                                    nth0 = 32;
                                    nth1 = 1;
                                    if (ne11 * ne12 < 4) {
                                        [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_1row];
                                    } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
                                        [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_l4];
                                        nrows = ne11;
                                    } else {
                                        [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
                                        nrows = 4;
                                    }
                                } break;
                            case GGML_TYPE_Q4_0:
@ -1012,7 +1063,7 @@ void ggml_metal_graph_compute(
                    else if (src0t == GGML_TYPE_Q6_K) {
                        [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                    } else {
                        int64_t ny = (ne11 + 3)/4;
                        int64_t ny = (ne11 + nrows - 1)/nrows;
                        [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                    }
                }
@ -1020,6 +1071,7 @@ void ggml_metal_graph_compute(
            case GGML_OP_GET_ROWS:
                {
                    switch (src0->type) {
                        case GGML_TYPE_F32:  [encoder setComputePipelineState:ctx->pipeline_get_rows_f32];  break;
                        case GGML_TYPE_F16:  [encoder setComputePipelineState:ctx->pipeline_get_rows_f16];  break;
                        case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break;
                        case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_1]; break;
@ -1035,9 +1087,9 @@ void ggml_metal_graph_compute(
                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                    [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
                    [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
                    [encoder setBytes:&(src0->ne[0]) length:sizeof( int64_t) atIndex:3];
                    [encoder setBytes:&(src0->nb[1]) length:sizeof(uint64_t) atIndex:4];
                    [encoder setBytes:&(dst->nb[1]) length:sizeof(uint64_t) atIndex:5];
                    [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3];
                    [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:4];
                    [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:5];

                    const int64_t n = ggml_nelements(src1);


@ -38,7 +38,7 @@ kernel void kernel_add_row(
    device const float4 * src0,
    device const float4 * src1,
    device float4 * dst,
    constant int64_t & nb,
    uint tpig[[thread_position_in_grid]]) {
    dst[tpig] = src0[tpig] + src1[tpig % nb];
}
@ -63,18 +63,18 @@ kernel void kernel_mul_row(
}

kernel void kernel_scale(
    device const float * src0,
    device const float4 * src0,
    device float * dst,
    device float4 * dst,
    constant float & scale,
    uint tpig[[thread_position_in_grid]]) {
    dst[tpig] = src0[tpig] * scale;
}

kernel void kernel_silu(
    device const float * src0,
    device const float4 * src0,
    device float * dst,
    device float4 * dst,
    uint tpig[[thread_position_in_grid]]) {
    float x = src0[tpig];
    device const float4 & x = src0[tpig];
    dst[tpig] = x / (1.0f + exp(-x));
}
@ -89,10 +89,10 @@ constant float GELU_COEF_A = 0.044715f;
constant float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;

kernel void kernel_gelu(
    device const float * src0,
    device const float4 * src0,
    device float * dst,
    device float4 * dst,
    uint tpig[[thread_position_in_grid]]) {
    float x = src0[tpig];
    device const float4 & x = src0[tpig];

    // BEWARE !!!
    // Simply using "tanh" instead of "precise::tanh" will sometimes result in NaNs!
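For reference, these constants implement the usual tanh approximation of GELU. A scalar C++ restatement of the formula, matching the constants above (my illustration, not part of the kernel):

```cpp
#include <cmath>

constexpr float GELU_COEF_A    = 0.044715f;
constexpr float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;

// gelu(x) ~= 0.5*x*(1 + tanh(sqrt(2/pi)*(x + 0.044715*x^3)))
static float gelu_approx(float x) {
    return 0.5f * x * (1.0f + std::tanh(SQRT_2_OVER_PI * x * (1.0f + GELU_COEF_A * x * x)));
}
```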
@ -107,7 +107,6 @@ kernel void kernel_soft_max(
constant int64_t & ne00, constant int64_t & ne00,
constant int64_t & ne01, constant int64_t & ne01,
constant int64_t & ne02, constant int64_t & ne02,
threadgroup float * buf [[threadgroup(0)]],
uint3 tgpig[[threadgroup_position_in_grid]], uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]], uint3 tpitg[[thread_position_in_threadgroup]],
uint3 ntg[[threads_per_threadgroup]]) { uint3 ntg[[threads_per_threadgroup]]) {
@ -119,64 +118,70 @@ kernel void kernel_soft_max(
device float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; device float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
// parallel max // parallel max
buf[tpitg[0]] = -INFINITY; float lmax = tpitg[0] < ne00 ? psrc0[tpitg[0]] : -INFINITY;
for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) { for (int i00 = tpitg[0] + ntg[0]; i00 < ne00; i00 += ntg[0]) {
buf[tpitg[0]] = MAX(buf[tpitg[0]], psrc0[i00]); lmax = MAX(lmax, psrc0[i00]);
} }
const float max = simd_max(lmax);
// reduce
threadgroup_barrier(mem_flags::mem_threadgroup);
for (uint i = ntg[0]/2; i > 0; i /= 2) {
if (tpitg[0] < i) {
buf[tpitg[0]] = MAX(buf[tpitg[0]], buf[tpitg[0] + i]);
}
threadgroup_barrier(mem_flags::mem_threadgroup);
}
//// broadcast - not needed. There is a threadgroup barrier above in the last iteration of
// the loop, and when that is done, buf[0] has the correct (synchronized) value
//if (tpitg[0] == 0) {
// buf[0] = buf[0];
//}
//threadgroup_barrier(mem_flags::mem_threadgroup);
const float max = buf[0];
// parallel sum // parallel sum
buf[tpitg[0]] = 0.0f; float lsum = 0.0f;
for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) { for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
const float exp_psrc0 = exp(psrc0[i00] - max); const float exp_psrc0 = exp(psrc0[i00] - max);
buf[tpitg[0]] += exp_psrc0; lsum += exp_psrc0;
// Remember the result of exp here. exp is expensive, so we really do not // Remember the result of exp here. exp is expensive, so we really do not
// whish to compute it twice. // whish to compute it twice.
pdst[i00] = exp_psrc0; pdst[i00] = exp_psrc0;
} }
// reduce const float sum = simd_sum(lsum);
threadgroup_barrier(mem_flags::mem_threadgroup);
for (uint i = ntg[0]/2; i > 0; i /= 2) {
if (tpitg[0] < i) {
buf[tpitg[0]] += buf[tpitg[0] + i];
}
threadgroup_barrier(mem_flags::mem_threadgroup);
}
// broadcast - not needed, see above
//// broadcast
//if (tpitg[0] == 0) {
// buf[0] = buf[0];
//}
//threadgroup_barrier(mem_flags::mem_threadgroup);
const float sum = buf[0];
for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) { for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
pdst[i00] /= sum; pdst[i00] /= sum;
} }
} }
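
The rewritten kernel drops the threadgroup-memory tree reduction, and the barriers that came with it, in favor of SIMD-group reductions: each thread folds a strided slice of the row into private lmax/lsum partials, and a single simd_max/simd_sum combines the lanes in hardware. This is only valid when the threadgroup spans at most one simdgroup, which the dispatching host code presumably guarantees (an assumption; the host side is not part of this diff). A minimal CPU sketch of the same two-pass scheme, with the 32 lanes emulated by a loop:

```cpp
#include <algorithm>
#include <cmath>
#include <vector>

// CPU sketch of the reworked kernel_soft_max: each "lane" owns a strided
// slice of the row; simd_max/simd_sum are emulated by reducing the
// per-lane partials in plain loops.
void softmax_row(const float * x, float * y, int n, int lanes = 32) {
    std::vector<float> lmax(lanes, -INFINITY), lsum(lanes, 0.0f);

    // parallel max: lane t scans elements t, t + lanes, t + 2*lanes, ...
    for (int t = 0; t < lanes; ++t)
        for (int i = t; i < n; i += lanes)
            lmax[t] = std::max(lmax[t], x[i]);
    const float max = *std::max_element(lmax.begin(), lmax.end()); // simd_max

    // parallel sum of exp(x - max); exp is cached in y so the final
    // pass does not have to recompute it
    for (int t = 0; t < lanes; ++t)
        for (int i = t; i < n; i += lanes) {
            const float e = std::exp(x[i] - max);
            lsum[t] += e;
            y[i] = e;
        }
    float sum = 0.0f;
    for (float s : lsum) sum += s; // simd_sum

    for (int i = 0; i < n; ++i) y[i] /= sum;
}
```

Caching exp(x[i] - max) in the output buffer during the sum pass is what the "exp is expensive" comment refers to: the final pass only divides.
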
kernel void kernel_soft_max_4(
device const float * src0,
device float * dst,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]],
uint3 ntg[[threads_per_threadgroup]]) {
const int64_t i03 = tgpig[2];
const int64_t i02 = tgpig[1];
const int64_t i01 = tgpig[0];
device const float4 * psrc4 = (device const float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
device float4 * pdst4 = (device float4 *)(dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
// parallel max
float4 lmax4 = tpitg[0] < ne00/4 ? psrc4[tpitg[0]] : -INFINITY;
for (int i00 = tpitg[0] + ntg[0]; i00 < ne00/4; i00 += ntg[0]) {
lmax4 = fmax(lmax4, psrc4[i00]);
}
float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3]));
const float max = simd_max(lmax);
// parallel sum
float4 lsum4 = 0.0f;
for (int i00 = tpitg[0]; i00 < ne00/4; i00 += ntg[0]) {
const float4 exp_psrc4 = exp(psrc4[i00] - max);
lsum4 += exp_psrc4;
pdst4[i00] = exp_psrc4;
}
float lsum = lsum4[0] + lsum4[1] + lsum4[2] + lsum4[3];
const float sum = simd_sum(lsum);
for (int i00 = tpitg[0]; i00 < ne00/4; i00 += ntg[0]) {
pdst4[i00] /= sum;
}
}
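
kernel_soft_max_4 above is the same algorithm over float4 vectors: the strided loops run a quarter of the iterations, at the cost of a horizontal reduction of the four components before the SIMD reduction. It touches only ne00/4 vectors, so the host presumably dispatches it only for row lengths divisible by 4 (an assumption; the dispatch logic is not in this diff). The horizontal steps, with a plain array standing in for float4:

```cpp
#include <algorithm>
#include <array>

using float4 = std::array<float, 4>; // stand-in for the Metal float4

// lmax4 -> lmax: pairwise max of the four components
float hmax(const float4 & v) {
    return std::max(std::max(v[0], v[1]), std::max(v[2], v[3]));
}

// lsum4 -> lsum: sum of the four components
float hsum(const float4 & v) {
    return (v[0] + v[1]) + (v[2] + v[3]);
}
```
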
kernel void kernel_diag_mask_inf( kernel void kernel_diag_mask_inf(
device const float * src0, device const float * src0,
device float * dst, device float * dst,
@ -192,6 +197,33 @@ kernel void kernel_diag_mask_inf(
dst[i02*ne01*ne00 + i01*ne00 + i00] = -INFINITY; dst[i02*ne01*ne00 + i01*ne00 + i00] = -INFINITY;
} else { } else {
dst[i02*ne01*ne00 + i01*ne00 + i00] = src0[i02*ne01*ne00 + i01*ne00 + i00]; dst[i02*ne01*ne00 + i01*ne00 + i00] = src0[i02*ne01*ne00 + i01*ne00 + i00];
}
}
kernel void kernel_diag_mask_inf_8(
device const float4 * src0,
device float4 * dst,
constant int64_t & ne00,
constant int64_t & ne01,
constant int & n_past,
uint3 tpig[[thread_position_in_grid]]) {
const int64_t i = 2*tpig[0];
dst[i+0] = src0[i+0];
dst[i+1] = src0[i+1];
int64_t i4 = 4*i;
const int64_t i02 = i4/(ne00*ne01); i4 -= i02*ne00*ne01;
const int64_t i01 = i4/(ne00); i4 -= i01*ne00;
const int64_t i00 = i4;
for (int k = 3; k >= 0; --k) {
if (i00 + 4 + k <= n_past + i01) {
break;
}
dst[i+1][k] = -INFINITY;
if (i00 + k > n_past + i01) {
dst[i][k] = -INFINITY;
}
} }
} }
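
kernel_diag_mask_inf_8 above copies eight floats per thread as two float4 stores, then walks backwards over at most the last eight columns, overwriting with -INFINITY wherever the column index exceeds n_past + row; the break exits as soon as it is back inside the allowed causal region. The scalar predicate both variants implement, as a plain reference:

```cpp
#include <cstdint>
#include <limits>

// Reference for the causal mask: column i00 of row i01 survives only
// while i00 <= n_past + i01; everything to the right becomes -inf.
void diag_mask_inf(float * dst, const float * src,
                   int64_t ne00, int64_t ne01, int64_t ne02, int n_past) {
    const float inf = std::numeric_limits<float>::infinity();
    for (int64_t i02 = 0; i02 < ne02; ++i02)
    for (int64_t i01 = 0; i01 < ne01; ++i01)       // row
    for (int64_t i00 = 0; i00 < ne00; ++i00) {     // column
        const int64_t idx = i02*ne01*ne00 + i01*ne00 + i00;
        dst[idx] = i00 > n_past + i01 ? -inf : src[idx];
    }
}
```
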
@ -491,6 +523,79 @@ kernel void kernel_mul_mat_q8_0_f32(
} }
} }
#define N_F32_F32 4
kernel void kernel_mul_mat_f32_f32(
device const char * src0,
device const char * src1,
device float * dst,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
constant uint64_t & nb00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant int64_t & ne10,
constant int64_t & ne11,
constant int64_t & ne12,
constant uint64_t & nb10,
constant uint64_t & nb11,
constant uint64_t & nb12,
constant int64_t & ne0,
constant int64_t & ne1,
uint3 tgpig[[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]]) {
const int64_t r0 = tgpig.x;
const int64_t rb = tgpig.y*N_F32_F32;
const int64_t im = tgpig.z;
device const float * x = (device const float *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
if (ne00 < 128) {
for (int row = 0; row < N_F32_F32; ++row) {
int r1 = rb + row;
if (r1 >= ne11) {
break;
}
device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
float sumf = 0;
for (int i = tiisg; i < ne00; i += 32) {
sumf += (float) x[i] * (float) y[i];
}
float all_sum = simd_sum(sumf);
if (tiisg == 0) {
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
}
}
} else {
device const float4 * x4 = (device const float4 *)x;
for (int row = 0; row < N_F32_F32; ++row) {
int r1 = rb + row;
if (r1 >= ne11) {
break;
}
device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
device const float4 * y4 = (device const float4 *) y;
float sumf = 0;
for (int i = tiisg; i < ne00/4; i += 32) {
for (int k = 0; k < 4; ++k) sumf += (float) x4[i][k] * y4[i][k];
}
float all_sum = simd_sum(sumf);
if (tiisg == 0) {
for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i];
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
}
}
}
}
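
In the new F32 x F32 kernel each simdgroup produces one output element per src1 row it visits: the 32 lanes stride across the dot product, simd_sum folds the partials, and lane 0 writes the result. Rows of 128 elements or more take a float4 path, with lane 0 folding in the scalar tail after the reduction. The long-row split, sketched sequentially:

```cpp
// Sequential sketch of the long-row path of kernel_mul_mat_f32_f32:
// 32 "lanes" stride over float4 chunks; the scalar tail that does not
// fill a chunk is folded in once, as lane 0 does after simd_sum.
float dot_strided(const float * x, const float * y, int n, int lanes = 32) {
    float sum = 0.0f;
    for (int t = 0; t < lanes; ++t) {          // one pass per SIMD lane
        float partial = 0.0f;
        for (int i = t; i < n/4; i += lanes)   // chunk i covers x[4i..4i+3]
            for (int k = 0; k < 4; ++k)
                partial += x[4*i + k] * y[4*i + k];
        sum += partial;                        // simd_sum across lanes
    }
    for (int i = 4*(n/4); i < n; ++i)          // scalar tail
        sum += x[i] * y[i];
    return sum;
}
```
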
kernel void kernel_mul_mat_f16_f32_1row( kernel void kernel_mul_mat_f16_f32_1row(
device const char * src0, device const char * src0,
device const char * src1, device const char * src1,
@ -616,6 +721,49 @@ kernel void kernel_mul_mat_f16_f32(
} }
} }
// Assumes row size (ne00) is a multiple of 4
kernel void kernel_mul_mat_f16_f32_l4(
device const char * src0,
device const char * src1,
device float * dst,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
constant uint64_t & nb00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant int64_t & ne10,
constant int64_t & ne11,
constant int64_t & ne12,
constant uint64_t & nb10,
constant uint64_t & nb11,
constant uint64_t & nb12,
constant int64_t & ne0,
constant int64_t & ne1,
uint3 tgpig[[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]]) {
const int nrows = ne11;
const int64_t r0 = tgpig.x;
const int64_t im = tgpig.z;
device const half4 * x4 = (device const half4 *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
for (int r1 = 0; r1 < nrows; ++r1) {
device const float4 * y4 = (device const float4 *) (src1 + r1*nb11 + im*nb12);
float sumf = 0;
for (int i = tiisg; i < ne00/4; i += 32) {
for (int k = 0; k < 4; ++k) sumf += (float) x4[i][k] * y4[i][k];
}
float all_sum = simd_sum(sumf);
if (tiisg == 0) {
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
}
}
}
kernel void kernel_alibi_f32( kernel void kernel_alibi_f32(
device const float * src0, device const float * src0,
device float * dst, device float * dst,
@ -1246,7 +1394,6 @@ kernel void kernel_mul_mat_q3_K_f32(
dst[r1*ne0 + r2*ne0*ne1 + first_row + row] = sumf1[row]; dst[r1*ne0 + r2*ne0*ne1 + first_row + row] = sumf1[row];
} }
} }
} }
#else #else
kernel void kernel_mul_mat_q3_K_f32( kernel void kernel_mul_mat_q3_K_f32(
@ -1325,13 +1472,13 @@ kernel void kernel_mul_mat_q4_K_f32(
device const float * src1, device const float * src1,
device float * dst, device float * dst,
constant int64_t & ne00, constant int64_t & ne00,
constant int64_t & ne01[[buffer(4)]], constant int64_t & ne01 [[buffer(4)]],
constant int64_t & ne02[[buffer(5)]], constant int64_t & ne02 [[buffer(5)]],
constant int64_t & ne10[[buffer(9)]], constant int64_t & ne10 [[buffer(9)]],
constant int64_t & ne12[[buffer(11)]], constant int64_t & ne12 [[buffer(11)]],
constant int64_t & ne0[[buffer(15)]], constant int64_t & ne0 [[buffer(15)]],
constant int64_t & ne1[[buffer(16)]], constant int64_t & ne1 [[buffer(16)]],
constant uint & gqa[[buffer(17)]], constant uint & gqa [[buffer(17)]],
uint3 tgpig[[threadgroup_position_in_grid]], uint3 tgpig[[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]], uint tiisg[[thread_index_in_simdgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]]) { uint sgitg[[simdgroup_index_in_threadgroup]]) {
@ -1790,6 +1937,15 @@ kernel void kernel_mul_mat_q6_K_f32(
//============================= templates and their specializations ============================= //============================= templates and their specializations =============================
// NOTE: this is not dequantizing - we are simply fitting the template
template <typename type4x4>
void dequantize_f32(device const float4x4 * src, short il, thread type4x4 & reg) {
float4x4 temp = *(((device float4x4 *)src));
for (int i = 0; i < 16; i++){
reg[i/4][i%4] = temp[i/4][i%4];
}
}
template <typename type4x4> template <typename type4x4>
void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg) { void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg) {
half4x4 temp = *(((device half4x4 *)src)); half4x4 temp = *(((device half4x4 *)src));
@ -1801,28 +1957,30 @@ void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg)
template <typename type4x4> template <typename type4x4>
void dequantize_q4_0(device const block_q4_0 *xb, short il, thread type4x4 & reg) { void dequantize_q4_0(device const block_q4_0 *xb, short il, thread type4x4 & reg) {
device const uint16_t * qs = ((device const uint16_t *)xb + 1); device const uint16_t * qs = ((device const uint16_t *)xb + 1);
const half d = il ? (xb->d / 16.h) : xb->d; const float d1 = il ? (xb->d / 16.h) : xb->d;
const half m = il ? ( -8.h * 16.h) : -8.h; const float d2 = d1 / 256.f;
const float md = -8.h * xb->d;
const ushort mask0 = il ? 0x00F0 : 0x000F; const ushort mask0 = il ? 0x00F0 : 0x000F;
const ushort mask1 = il ? 0xF000 : 0x0F00; const ushort mask1 = mask0 << 8;
for (int i=0;i<8;i++) { for (int i=0;i<8;i++) {
reg[i/2][2*(i%2)] = (((qs[i] & mask0) ) + m) * d; reg[i/2][2*(i%2)+0] = d1 * (qs[i] & mask0) + md;
reg[i/2][2*(i%2)+1] = (((qs[i] & mask1) >> 8) + m) * d; reg[i/2][2*(i%2)+1] = d2 * (qs[i] & mask1) + md;
} }
} }
template <typename type4x4> template <typename type4x4>
void dequantize_q4_1(device const block_q4_1 *xb, short il, thread type4x4 & reg) { void dequantize_q4_1(device const block_q4_1 *xb, short il, thread type4x4 & reg) {
device const uint16_t * qs = ((device const uint16_t *)xb + 2); device const uint16_t * qs = ((device const uint16_t *)xb + 2);
const half d = il ? (xb->d / 16.h) : xb->d; const float d1 = il ? (xb->d / 16.h) : xb->d;
const half m = xb->m; const float d2 = d1 / 256.f;
const float m = xb->m;
const ushort mask0 = il ? 0x00F0 : 0x000F; const ushort mask0 = il ? 0x00F0 : 0x000F;
const ushort mask1 = il ? 0xF000 : 0x0F00; const ushort mask1 = mask0 << 8;
for (int i=0;i<8;i++) { for (int i=0;i<8;i++) {
reg[i/2][2*(i%2)] = (((qs[i] & mask0) ) * d) + m; reg[i/2][2*(i%2)+0] = ((qs[i] & mask0) * d1) + m;
reg[i/2][2*(i%2)+1] = (((qs[i] & mask1) >> 8) * d) + m; reg[i/2][2*(i%2)+1] = ((qs[i] & mask1) * d2) + m;
} }
} }
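
Both 4-bit dequantizers now avoid the per-element ">> 8" on the high byte by folding the shift into a second scale, d2 = d1 / 256, and the mask pair collapses to mask1 = mask0 << 8; q4_0 additionally premultiplies its -8 offset into md. Because d / 256 is an exact power-of-two rescale, the shifted and folded forms round identically. A small self-check of that identity (the scale value is an arbitrary example):

```cpp
#include <cassert>
#include <cstdint>

// ((q & 0x0F00) >> 8) * d  ==  (q & 0x0F00) * (d / 256): both are a single
// rounding of the same exact product, since d / 256 only shifts the exponent.
int main() {
    const float d = 0.125f; // example scale, hypothetical value
    for (uint32_t qs = 0; qs <= 0xFFFF; ++qs) {
        const float shifted = ((qs & 0x0F00) >> 8) * d;
        const float folded  =  (qs & 0x0F00) * (d / 256.0f);
        assert(shifted == folded);
    }
    return 0;
}
```
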
@ -1858,7 +2016,7 @@ void dequantize_q2_K(device const block_q2_K *xb, short il, thread type4x4 & reg
template <typename type4x4> template <typename type4x4>
void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg) { void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg) {
const float d_all = (float)(xb->d); const half d_all = xb->d;
device const uint8_t * q = (device const uint8_t *)xb->qs; device const uint8_t * q = (device const uint8_t *)xb->qs;
device const uint8_t * h = (device const uint8_t *)xb->hmask; device const uint8_t * h = (device const uint8_t *)xb->hmask;
device const int8_t * scales = (device const int8_t *)xb->scales; device const int8_t * scales = (device const int8_t *)xb->scales;
@ -1871,16 +2029,18 @@ void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg
((il/4)>0 ? 12 : 3); ((il/4)>0 ? 12 : 3);
uint16_t kmask2 = il/8 ? 0xF0 : 0x0F; uint16_t kmask2 = il/8 ? 0xF0 : 0x0F;
uint16_t scale_2 = scales[il%8], scale_1 = scales[8 + il%4]; uint16_t scale_2 = scales[il%8], scale_1 = scales[8 + il%4];
int16_t dl_int = (il/4)&1 ? (scale_2&kmask2) | ((scale_1&kmask1) << 2) : \ int16_t dl_int = (il/4)&1 ? (scale_2&kmask2) | ((scale_1&kmask1) << 2)
(scale_2&kmask2) | ((scale_1&kmask1) << 4); : (scale_2&kmask2) | ((scale_1&kmask1) << 4);
float dl = il<8 ? d_all * (dl_int - 32.f) : d_all * (dl_int / 16.f - 32.f); half dl = il<8 ? d_all * (dl_int - 32.h) : d_all * (dl_int / 16.h - 32.h);
const half ml = 4.h * dl;
il = (il/2)%4; il = (il/2) & 3;
float coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h); const half coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h);
uint8_t mask = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3); const uint8_t mask = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3);
dl *= coef;
for (int i = 0; i < 16; ++i) { for (int i = 0; i < 16; ++i) {
reg[i/4][i%4] = coef * dl * ((q[i] & mask) - ((h[i] & m) ? 0 : 4.f/coef)); reg[i/4][i%4] = dl * (q[i] & mask) - (h[i] & m ? 0 : ml);
} }
#else #else
float kcoef = il&1 ? 1.f/16.f : 1.f; float kcoef = il&1 ? 1.f/16.f : 1.f;
@ -1895,26 +2055,31 @@ void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg
#endif #endif
} }
static inline uchar2 get_scale_min_k4_just2(int j, int k, device const uchar * q) {
return j < 4 ? uchar2{uchar(q[j+0+k] & 63), uchar(q[j+4+k] & 63)}
: uchar2{uchar((q[j+4+k] & 0xF) | ((q[j-4+k] & 0xc0) >> 2)), uchar((q[j+4+k] >> 4) | ((q[j-0+k] & 0xc0) >> 2))};
}
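
get_scale_min_k4_just2 pulls a single (scale, min) pair out of the 12-byte K-quant scale block instead of decoding four values as the older get_scale_min_k4 did. Layout: entries 0-3 live in the low six bits of bytes 0-3 (scales) and bytes 4-7 (mins); entries 4-7 are reassembled from the nibbles of bytes 8-11 plus the spare top two bits of bytes 0-7. A reference decode with the kernel's k offset already folded into j:

```cpp
#include <cstdint>
#include <utility>

// Decode one (scale, min) pair from the packed q4_K/q5_K scale block.
std::pair<uint8_t, uint8_t> scale_min_k4(int j, const uint8_t * q) {
    if (j < 4) { // low six bits of bytes 0..3 (scale) and 4..7 (min)
        return { uint8_t(q[j] & 63), uint8_t(q[j + 4] & 63) };
    }
    // nibbles of bytes 8..11, high two bits borrowed from bytes 0..7
    const uint8_t sc = (q[j + 4] & 0x0F) | ((q[j - 4] >> 6) << 4);
    const uint8_t mn = (q[j + 4] >>   4) | ((q[j    ] >> 6) << 4);
    return { sc, mn };
}
```
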
template <typename type4x4> template <typename type4x4>
void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg) { void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg) {
device const uint8_t * q = xb->qs; device const uchar * q = xb->qs;
#if QK_K == 256 #if QK_K == 256
const float d = (float)(xb->d);
const float min = (float)(xb->dmin);
short is = (il/4) * 2; short is = (il/4) * 2;
q = q + (il/4) * 32 + 16 * (il&1); q = q + (il/4) * 32 + 16 * (il&1);
il = il%4; il = il & 3;
const uchar4 sc = get_scale_min_k4(is, xb->scales); const uchar2 sc = get_scale_min_k4_just2(is, il/2, xb->scales);
const float dl = il<2 ? d * sc[0] : d * sc[2]/16.h; const half d = il < 2 ? xb->d : xb->d / 16.h;
const float ml = il<2 ? min * sc[1] : min * sc[3]; const half min = xb->dmin;
const half dl = d * sc[0];
const half ml = min * sc[1];
#else #else
q = q + 16 * (il&1); q = q + 16 * (il&1);
device const uint8_t * s = xb->scales; device const uint8_t * s = xb->scales;
device const half2 * dh = (device const half2 *)xb->d; device const half2 * dh = (device const half2 *)xb->d;
const float2 d = (float2)dh[0]; const float2 d = (float2)dh[0];
const float dl = il<2 ? d[0] * (s[0]&0xF) : d[0] * (s[1]&0xF)/16.h; const float dl = il<2 ? d[0] * (s[0]&0xF) : d[0] * (s[1]&0xF)/16.h;
const float ml = il<2 ? d[1] * (s[0]>>4) : d[1 ]* (s[1]>>4); const float ml = il<2 ? d[1] * (s[0]>>4) : d[1] * (s[1]>>4);
#endif #endif
const ushort mask = il<2 ? 0x0F : 0xF0; const ushort mask = il<2 ? 0x0F : 0xF0;
for (int i = 0; i < 16; ++i) { for (int i = 0; i < 16; ++i) {
@ -1928,19 +2093,19 @@ void dequantize_q5_K(device const block_q5_K *xb, short il, thread type4x4 & reg
device const uint8_t * qh = xb->qh; device const uint8_t * qh = xb->qh;
#if QK_K == 256 #if QK_K == 256
const float d = (float)(xb->d);
const float min = (float)(xb->dmin);
short is = (il/4) * 2; short is = (il/4) * 2;
q = q + 32 * (il/4) + 16 * (il&1); q = q + 32 * (il/4) + 16 * (il&1);
qh = qh + 16 * (il&1); qh = qh + 16 * (il&1);
uint8_t ul = 1 << (il/2); uint8_t ul = 1 << (il/2);
il = il%4; il = il & 3;
const uchar4 sc = get_scale_min_k4(is, xb->scales); const uchar2 sc = get_scale_min_k4_just2(is, il/2, xb->scales);
const float dl = il<2 ? d * sc[0] : d * sc[2]/16.h; const half d = il < 2 ? xb->d : xb->d / 16.h;
const float ml = il<2 ? min * sc[1] : min * sc[3]; const half min = xb->dmin;
const half dl = d * sc[0];
const half ml = min * sc[1];
const ushort mask = il<2 ? 0x0F : 0xF0; const ushort mask = il<2 ? 0x0F : 0xF0;
const float qh_val = il<2 ? 16.f : 256.f; const half qh_val = il<2 ? 16.h : 256.h;
for (int i = 0; i < 16; ++i) { for (int i = 0; i < 16; ++i) {
reg[i/4][i%4] = dl * ((q[i] & mask) + (qh[i] & ul ? qh_val : 0)) - ml; reg[i/4][i%4] = dl * ((q[i] & mask) + (qh[i] & ul ? qh_val : 0)) - ml;
} }
@ -1959,7 +2124,7 @@ void dequantize_q5_K(device const block_q5_K *xb, short il, thread type4x4 & reg
template <typename type4x4> template <typename type4x4>
void dequantize_q6_K(device const block_q6_K *xb, short il, thread type4x4 & reg) { void dequantize_q6_K(device const block_q6_K *xb, short il, thread type4x4 & reg) {
const float d_all = (float)(xb->d); const half d_all = xb->d;
device const uint8_t * ql = (device const uint8_t *)xb->ql; device const uint8_t * ql = (device const uint8_t *)xb->ql;
device const uint8_t * qh = (device const uint8_t *)xb->qh; device const uint8_t * qh = (device const uint8_t *)xb->qh;
device const int8_t * scales = (device const int8_t *)xb->scales; device const int8_t * scales = (device const int8_t *)xb->scales;
@ -1967,19 +2132,21 @@ void dequantize_q6_K(device const block_q6_K *xb, short il, thread type4x4 & reg
#if QK_K == 256 #if QK_K == 256
ql = ql + 64*(il/8) + 32*((il/2)&1) + 16*(il&1); ql = ql + 64*(il/8) + 32*((il/2)&1) + 16*(il&1);
qh = qh + 32*(il/8) + 16*(il&1); qh = qh + 32*(il/8) + 16*(il&1);
float sc = scales[(il%2) + 2 * ((il/2))]; half sc = scales[(il%2) + 2 * ((il/2))];
il = (il/2)%4; il = (il/2) & 3;
#else #else
ql = ql + 16 * (il&1); ql = ql + 16 * (il&1);
float sc = scales[il]; half sc = scales[il];
#endif #endif
const uint16_t kmask1 = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3);
const uint16_t kmask2 = il>1 ? 0xF0 : 0x0F;
const half coef = il>1 ? 1.f/16.h : 1.h;
const half ml = d_all * sc * 32.h;
const half dl = d_all * sc * coef;
for (int i = 0; i < 16; ++i) { for (int i = 0; i < 16; ++i) {
uint16_t kmask1 = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3); const half q = il&1 ? ((ql[i] & kmask2) | ((qh[i] & kmask1) << 2))
uint16_t kmask2 = il>1 ? 0xF0 : 0x0F; : ((ql[i] & kmask2) | ((qh[i] & kmask1) << 4));
const float coef = il>1 ? 1.f/16.f : 1.f; reg[i/4][i%4] = dl * q - ml;
float q = il&1 ? ((ql[i]&kmask2)|((qh[i]&kmask1)<<2)) - 32.f/coef : \
((ql[i]&kmask2)|((qh[i]&kmask1)<<4)) - 32.f/coef;
reg[i/4][i%4] = d_all * sc * q * coef;
} }
} }
@ -2019,22 +2186,25 @@ kernel void kernel_get_rows(
// each block_q contains 16*nl weights // each block_q contains 16*nl weights
template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread half4x4 &)> template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread half4x4 &)>
kernel void kernel_mul_mm(device const uchar * src0, kernel void kernel_mul_mm(device const uchar * src0,
device const float * src1, device const uchar * src1,
device float * dst, device float * dst,
constant int64_t & ne00, constant int64_t & ne00,
constant int64_t & ne02, constant int64_t & ne02,
constant int64_t & nb01, constant int64_t & nb01,
constant int64_t & nb02, constant int64_t & nb02,
constant int64_t & ne12, constant int64_t & ne12,
constant int64_t & ne0, constant int64_t & nb10,
constant int64_t & ne1, constant int64_t & nb11,
constant uint & gqa, constant int64_t & nb12,
threadgroup uchar * shared_memory [[threadgroup(0)]], constant int64_t & ne0,
uint3 tgpig[[threadgroup_position_in_grid]], constant int64_t & ne1,
uint tiitg[[thread_index_in_threadgroup]], constant uint & gqa,
uint sgitg[[simdgroup_index_in_threadgroup]]) { threadgroup uchar * shared_memory [[threadgroup(0)]],
uint3 tgpig[[threadgroup_position_in_grid]],
uint tiitg[[thread_index_in_threadgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]]) {
threadgroup half * sa = ((threadgroup half *)shared_memory); threadgroup half * sa = (threadgroup half *)(shared_memory);
threadgroup float * sb = (threadgroup float *)(shared_memory + 4096); threadgroup float * sb = (threadgroup float *)(shared_memory + 4096);
const uint r0 = tgpig.y; const uint r0 = tgpig.y;
@ -2047,7 +2217,7 @@ kernel void kernel_mul_mm(device const uchar * src0,
short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1; short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1;
short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1; short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1;
simdgroup_half8x8 ma[4]; simdgroup_half8x8 ma[4];
simdgroup_float8x8 mb[2]; simdgroup_float8x8 mb[2];
simdgroup_float8x8 c_res[8]; simdgroup_float8x8 c_res[8];
for (int i = 0; i < 8; i++){ for (int i = 0; i < 8; i++){
@ -2055,10 +2225,15 @@ kernel void kernel_mul_mm(device const uchar * src0,
} }
short il = (tiitg % THREAD_PER_ROW); short il = (tiitg % THREAD_PER_ROW);
uint offset0 = im/gqa*nb02; ushort offset1 = il/nl;
device const block_q * x = (device const block_q *)(src0 + (r0 * BLOCK_SIZE_M + thread_row) * nb01 + offset0) + offset1; uint offset0 = im/gqa*nb02;
device const float * y = src1 + (r1 * BLOCK_SIZE_N + thread_col) * ne00 \ ushort offset1 = il/nl;
+ BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL) + im * ne00 * ne1;
device const block_q * x = (device const block_q *)(src0 + (r0 * BLOCK_SIZE_M + thread_row) * nb01 + offset0) + offset1;
device const float * y = (device const float *)(src1
+ nb12 * im
+ nb11 * (r1 * BLOCK_SIZE_N + thread_col)
+ nb10 * (BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL)));
for (int loop_k = 0; loop_k < ne00; loop_k += BLOCK_SIZE_K) { for (int loop_k = 0; loop_k < ne00; loop_k += BLOCK_SIZE_K) {
//load data and store to threadgroup memory //load data and store to threadgroup memory
@ -2138,6 +2313,7 @@ kernel void kernel_mul_mm(device const uchar * src0,
typedef void (get_rows_t)(device const void *, device const int *, device float *, constant int64_t &, \ typedef void (get_rows_t)(device const void *, device const int *, device float *, constant int64_t &, \
constant uint64_t &, constant uint64_t &, uint, uint, uint); constant uint64_t &, constant uint64_t &, uint, uint, uint);
template [[host_name("kernel_get_rows_f32")]] kernel get_rows_t kernel_get_rows<float4x4, 1, dequantize_f32>;
template [[host_name("kernel_get_rows_f16")]] kernel get_rows_t kernel_get_rows<half4x4, 1, dequantize_f16>; template [[host_name("kernel_get_rows_f16")]] kernel get_rows_t kernel_get_rows<half4x4, 1, dequantize_f16>;
template [[host_name("kernel_get_rows_q4_0")]] kernel get_rows_t kernel_get_rows<block_q4_0, 2, dequantize_q4_0>; template [[host_name("kernel_get_rows_q4_0")]] kernel get_rows_t kernel_get_rows<block_q4_0, 2, dequantize_q4_0>;
template [[host_name("kernel_get_rows_q4_1")]] kernel get_rows_t kernel_get_rows<block_q4_1, 2, dequantize_q4_1>; template [[host_name("kernel_get_rows_q4_1")]] kernel get_rows_t kernel_get_rows<block_q4_1, 2, dequantize_q4_1>;
@ -2148,14 +2324,28 @@ template [[host_name("kernel_get_rows_q4_K")]] kernel get_rows_t kernel_get_rows
template [[host_name("kernel_get_rows_q5_K")]] kernel get_rows_t kernel_get_rows<block_q5_K, QK_NL, dequantize_q5_K>; template [[host_name("kernel_get_rows_q5_K")]] kernel get_rows_t kernel_get_rows<block_q5_K, QK_NL, dequantize_q5_K>;
template [[host_name("kernel_get_rows_q6_K")]] kernel get_rows_t kernel_get_rows<block_q6_K, QK_NL, dequantize_q6_K>; template [[host_name("kernel_get_rows_q6_K")]] kernel get_rows_t kernel_get_rows<block_q6_K, QK_NL, dequantize_q6_K>;
typedef void (mat_mm_t)(device const uchar *, device const float *, device float *, constant int64_t &,\ typedef void (mat_mm_t)(
constant int64_t &, constant int64_t &, constant int64_t &, constant int64_t &, \ device const uchar * src0,
constant int64_t &, constant int64_t &, constant uint &, threadgroup uchar *, uint3, uint, uint); device const uchar * src1,
device float * dst,
constant int64_t & ne00,
constant int64_t & ne02,
constant int64_t & nb01,
constant int64_t & nb02,
constant int64_t & ne12,
constant int64_t & nb10,
constant int64_t & nb11,
constant int64_t & nb12,
constant int64_t & ne0,
constant int64_t & ne1,
constant uint & gqa,
threadgroup uchar *, uint3, uint, uint);
template [[host_name("kernel_mul_mm_f16_f32")]] kernel mat_mm_t kernel_mul_mm<half4x4, 1, dequantize_f16>; template [[host_name("kernel_mul_mm_f32_f32")]] kernel mat_mm_t kernel_mul_mm<float4x4, 1, dequantize_f32>;
template [[host_name("kernel_mul_mm_q4_0_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_0, 2, dequantize_q4_0>; template [[host_name("kernel_mul_mm_f16_f32")]] kernel mat_mm_t kernel_mul_mm<half4x4, 1, dequantize_f16>;
template [[host_name("kernel_mul_mm_q4_1_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_1, 2, dequantize_q4_1>; template [[host_name("kernel_mul_mm_q4_0_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_0, 2, dequantize_q4_0>;
template [[host_name("kernel_mul_mm_q8_0_f32")]] kernel mat_mm_t kernel_mul_mm<block_q8_0, 2, dequantize_q8_0>; template [[host_name("kernel_mul_mm_q4_1_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_1, 2, dequantize_q4_1>;
template [[host_name("kernel_mul_mm_q8_0_f32")]] kernel mat_mm_t kernel_mul_mm<block_q8_0, 2, dequantize_q8_0>;
template [[host_name("kernel_mul_mm_q2_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q2_K, QK_NL, dequantize_q2_K>; template [[host_name("kernel_mul_mm_q2_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q2_K, QK_NL, dequantize_q2_K>;
template [[host_name("kernel_mul_mm_q3_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q3_K, QK_NL, dequantize_q3_K>; template [[host_name("kernel_mul_mm_q3_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q3_K, QK_NL, dequantize_q3_K>;
template [[host_name("kernel_mul_mm_q4_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_K, QK_NL, dequantize_q4_K>; template [[host_name("kernel_mul_mm_q4_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_K, QK_NL, dequantize_q4_K>;
ggml.c (110 changes)
@ -284,7 +284,7 @@ typedef double ggml_float;
// 16-bit float // 16-bit float
// on Arm, we use __fp16 // on Arm, we use __fp16
// on x86, we use uint16_t // on x86, we use uint16_t
#ifdef __ARM_NEON #if defined(__ARM_NEON) && !defined(_MSC_VER)
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example: // if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
// //
@ -4356,10 +4356,21 @@ int64_t ggml_nrows(const struct ggml_tensor * tensor) {
} }
size_t ggml_nbytes(const struct ggml_tensor * tensor) { size_t ggml_nbytes(const struct ggml_tensor * tensor) {
size_t nbytes = tensor->ne[0]*tensor->nb[0]/ggml_blck_size(tensor->type); size_t nbytes;
for (int i = 1; i < GGML_MAX_DIMS; ++i) { size_t blck_size = ggml_blck_size(tensor->type);
nbytes += (tensor->ne[i] - 1)*tensor->nb[i]; if (blck_size == 1) {
nbytes = ggml_type_size(tensor->type);
for (int i = 0; i < GGML_MAX_DIMS; ++i) {
nbytes += (tensor->ne[i] - 1)*tensor->nb[i];
}
} }
else {
nbytes = tensor->ne[0]*tensor->nb[0]/blck_size;
for (int i = 1; i < GGML_MAX_DIMS; ++i) {
nbytes += (tensor->ne[i] - 1)*tensor->nb[i];
}
}
return nbytes; return nbytes;
} }
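
For types with a block size of 1, ggml_nbytes is now the byte offset of the last element plus one element, which stays correct for permuted or transposed views where nb[0] no longer equals the element size; quantized types keep the old row-based formula. A worked example of the new branch:

```cpp
#include <cstddef>
#include <cstdint>

// blck_size == 1 branch of the new ggml_nbytes: offset of the last
// element plus one element. ne = extents, nb = byte strides per dim.
size_t nbytes_strided(const int64_t ne[4], const size_t nb[4], size_t type_size) {
    size_t n = type_size;
    for (int i = 0; i < 4; ++i)
        n += (ne[i] - 1)*nb[i];
    return n;
}

// Transposed 2x3 f32 view: ne = {3, 2, 1, 1}, nb = {8, 4, 24, 24}
//   new: 4 + 2*8 + 1*4 = 24 bytes, exactly the parent's storage
//   old: ne[0]*nb[0] + (ne[1]-1)*nb[1] = 24 + 4 = 28, an overcount
```
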
@ -17891,10 +17902,18 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
} else { } else {
// wait for other threads to finish // wait for other threads to finish
const int last = node_n; const int last = node_n;
do { while (true) {
//sched_yield(); // TODO: this sched_yield can have significant impact on the performance - either positive or negative
// depending on the workload and the operating system.
// since it is not clear what the best approach is, it should potentially become user-configurable
// ref: https://github.com/ggerganov/ggml/issues/291
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
sched_yield();
#endif
node_n = atomic_load(&state->shared->node_n); node_n = atomic_load(&state->shared->node_n);
} while (node_n == last); if (node_n != last) break;
};
} }
// check if we should stop // check if we should stop
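
The wait loop now yields only in the Accelerate/OpenBLAS builds, where spinning worker threads would otherwise compete for cores with the BLAS library's own threads; for pure ggml workloads the busy spin is kept, since, as the new comment notes, yielding can cut either way. The shape of the loop in portable C++:

```cpp
#include <atomic>
#include <thread>

// Worker-side wait: spin until the shared node counter moves past `last`.
// The yield is compiled in only when an external BLAS may need the cores.
int wait_for_next_node(const std::atomic<int> & node_n, int last) {
    while (true) {
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
        std::this_thread::yield(); // counterpart of sched_yield()
#endif
        const int n = node_n.load(std::memory_order_relaxed);
        if (n != last) {
            return n;
        }
    }
}
```
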
@ -18956,10 +18975,11 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
for (int i = 0; i < cgraph->n_leafs; i++) { for (int i = 0; i < cgraph->n_leafs; i++) {
struct ggml_tensor * node = cgraph->leafs[i]; struct ggml_tensor * node = cgraph->leafs[i];
GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s\n", GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s %16s\n",
i, i,
node->ne[0], node->ne[1], node->ne[0], node->ne[1],
ggml_op_name(node->op)); ggml_op_name(node->op),
ggml_get_name(node));
} }
for (int i = 0; i < GGML_OP_COUNT; i++) { for (int i = 0; i < GGML_OP_COUNT; i++) {
@ -20771,27 +20791,27 @@ const char * gguf_type_name(enum gguf_type type) {
return GGUF_TYPE_NAME[type]; return GGUF_TYPE_NAME[type];
} }
int gguf_get_version(struct gguf_context * ctx) { int gguf_get_version(const struct gguf_context * ctx) {
return ctx->header.version; return ctx->header.version;
} }
size_t gguf_get_alignment(struct gguf_context * ctx) { size_t gguf_get_alignment(const struct gguf_context * ctx) {
return ctx->alignment; return ctx->alignment;
} }
size_t gguf_get_data_offset(struct gguf_context * ctx) { size_t gguf_get_data_offset(const struct gguf_context * ctx) {
return ctx->offset; return ctx->offset;
} }
void * gguf_get_data(struct gguf_context * ctx) { void * gguf_get_data(const struct gguf_context * ctx) {
return ctx->data; return ctx->data;
} }
int gguf_get_n_kv(struct gguf_context * ctx) { int gguf_get_n_kv(const struct gguf_context * ctx) {
return ctx->header.n_kv; return ctx->header.n_kv;
} }
int gguf_find_key(struct gguf_context * ctx, const char * key) { int gguf_find_key(const struct gguf_context * ctx, const char * key) {
// return -1 if key not found // return -1 if key not found
int keyfound = -1; int keyfound = -1;
@ -20807,85 +20827,85 @@ int gguf_find_key(struct gguf_context * ctx, const char * key) {
return keyfound; return keyfound;
} }
const char * gguf_get_key(struct gguf_context * ctx, int i) { const char * gguf_get_key(const struct gguf_context * ctx, int i) {
return ctx->kv[i].key.data; return ctx->kv[i].key.data;
} }
enum gguf_type gguf_get_kv_type(struct gguf_context * ctx, int i) { enum gguf_type gguf_get_kv_type(const struct gguf_context * ctx, int i) {
return ctx->kv[i].type; return ctx->kv[i].type;
} }
enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i) { enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int i) {
return ctx->kv[i].value.arr.type; return ctx->kv[i].value.arr.type;
} }
const void * gguf_get_arr_data(struct gguf_context * ctx, int i) { const void * gguf_get_arr_data(const struct gguf_context * ctx, int i) {
return ctx->kv[i].value.arr.data; return ctx->kv[i].value.arr.data;
} }
const char * gguf_get_arr_str(struct gguf_context * ctx, int key_id, int i) { const char * gguf_get_arr_str(const struct gguf_context * ctx, int key_id, int i) {
struct gguf_kv * kv = &ctx->kv[key_id]; struct gguf_kv * kv = &ctx->kv[key_id];
struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[i]; struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[i];
return str->data; return str->data;
} }
int gguf_get_arr_n(struct gguf_context * ctx, int i) { int gguf_get_arr_n(const struct gguf_context * ctx, int i) {
return ctx->kv[i].value.arr.n; return ctx->kv[i].value.arr.n;
} }
uint8_t gguf_get_val_u8(struct gguf_context * ctx, int i) { uint8_t gguf_get_val_u8(const struct gguf_context * ctx, int i) {
return ctx->kv[i].value.uint8; return ctx->kv[i].value.uint8;
} }
int8_t gguf_get_val_i8(struct gguf_context * ctx, int i) { int8_t gguf_get_val_i8(const struct gguf_context * ctx, int i) {
return ctx->kv[i].value.int8; return ctx->kv[i].value.int8;
} }
uint16_t gguf_get_val_u16(struct gguf_context * ctx, int i) { uint16_t gguf_get_val_u16(const struct gguf_context * ctx, int i) {
return ctx->kv[i].value.uint16; return ctx->kv[i].value.uint16;
} }
int16_t gguf_get_val_i16(struct gguf_context * ctx, int i) { int16_t gguf_get_val_i16(const struct gguf_context * ctx, int i) {
return ctx->kv[i].value.int16; return ctx->kv[i].value.int16;
} }
uint32_t gguf_get_val_u32(struct gguf_context * ctx, int i) { uint32_t gguf_get_val_u32(const struct gguf_context * ctx, int i) {
return ctx->kv[i].value.uint32; return ctx->kv[i].value.uint32;
} }
int32_t gguf_get_val_i32(struct gguf_context * ctx, int i) { int32_t gguf_get_val_i32(const struct gguf_context * ctx, int i) {
return ctx->kv[i].value.int32; return ctx->kv[i].value.int32;
} }
float gguf_get_val_f32(struct gguf_context * ctx, int i) { float gguf_get_val_f32(const struct gguf_context * ctx, int i) {
return ctx->kv[i].value.float32; return ctx->kv[i].value.float32;
} }
uint64_t gguf_get_val_u64(struct gguf_context * ctx, int i) { uint64_t gguf_get_val_u64(const struct gguf_context * ctx, int i) {
return ctx->kv[i].value.uint64; return ctx->kv[i].value.uint64;
} }
int64_t gguf_get_val_i64(struct gguf_context * ctx, int i) { int64_t gguf_get_val_i64(const struct gguf_context * ctx, int i) {
return ctx->kv[i].value.int64; return ctx->kv[i].value.int64;
} }
double gguf_get_val_f64(struct gguf_context * ctx, int i) { double gguf_get_val_f64(const struct gguf_context * ctx, int i) {
return ctx->kv[i].value.float64; return ctx->kv[i].value.float64;
} }
bool gguf_get_val_bool(struct gguf_context * ctx, int i) { bool gguf_get_val_bool(const struct gguf_context * ctx, int i) {
return ctx->kv[i].value.bool_; return ctx->kv[i].value.bool_;
} }
const char * gguf_get_val_str (struct gguf_context * ctx, int i) { const char * gguf_get_val_str (const struct gguf_context * ctx, int i) {
return ctx->kv[i].value.str.data; return ctx->kv[i].value.str.data;
} }
int gguf_get_n_tensors(struct gguf_context * ctx) { int gguf_get_n_tensors(const struct gguf_context * ctx) {
return ctx->header.n_tensors; return ctx->header.n_tensors;
} }
int gguf_find_tensor(struct gguf_context * ctx, const char * name) { int gguf_find_tensor(const struct gguf_context * ctx, const char * name) {
// return -1 if tensor not found // return -1 if tensor not found
int tensorfound = -1; int tensorfound = -1;
@ -20901,11 +20921,11 @@ int gguf_find_tensor(struct gguf_context * ctx, const char * name) {
return tensorfound; return tensorfound;
} }
size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i) { size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i) {
return ctx->infos[i].offset; return ctx->infos[i].offset;
} }
char * gguf_get_tensor_name(struct gguf_context * ctx, int i) { char * gguf_get_tensor_name(const struct gguf_context * ctx, int i) {
return ctx->infos[i].name.data; return ctx->infos[i].name.data;
} }
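
With every getter now taking a const context, read-only inspection code can stay const end to end. A hedged sketch; gguf_init_from_file, gguf_init_params, and gguf_free come from the same header but are outside this diff, so their exact shape here is an assumption:

```cpp
#include <cstdio>
#include "ggml.h"

// Dump GGUF metadata through the now-const getters. The init/free calls
// are assumed from the surrounding header; they are not part of this diff.
void dump_gguf(const char * fname) {
    struct gguf_init_params params = { /*no_alloc =*/ true, /*ctx =*/ nullptr };
    struct gguf_context * raw = gguf_init_from_file(fname, params);
    if (raw == nullptr) {
        return;
    }
    const struct gguf_context * ctx = raw; // read-only from here on
    printf("version %d: %d kv pairs, %d tensors\n",
           gguf_get_version(ctx), gguf_get_n_kv(ctx), gguf_get_n_tensors(ctx));
    for (int i = 0; i < gguf_get_n_tensors(ctx); ++i) {
        printf("  %-32s offset %zu\n",
               gguf_get_tensor_name(ctx, i), gguf_get_tensor_offset(ctx, i));
    }
    gguf_free(raw);
}
```
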
@ -21188,7 +21208,7 @@ static void gguf_bwrite_el(struct gguf_buf * buf, const void * val, size_t el_si
buf->offset += el_size; buf->offset += el_size;
} }
static void gguf_write_to_buf(struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) { static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) {
// write header // write header
gguf_bwrite_el(buf, &ctx->header.magic, sizeof(ctx->header.magic)); gguf_bwrite_el(buf, &ctx->header.magic, sizeof(ctx->header.magic));
gguf_bwrite_el(buf, &ctx->header.version, sizeof(ctx->header.version)); gguf_bwrite_el(buf, &ctx->header.version, sizeof(ctx->header.version));
@ -21303,7 +21323,7 @@ static void gguf_write_to_buf(struct gguf_context * ctx, struct gguf_buf * buf,
} }
} }
void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta) { void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta) {
FILE * file = fopen(fname, "wb"); FILE * file = fopen(fname, "wb");
if (!file) { if (!file) {
GGML_ASSERT(false && "failed to open file for writing"); GGML_ASSERT(false && "failed to open file for writing");
@ -21320,7 +21340,7 @@ void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only
fclose(file); fclose(file);
} }
size_t gguf_get_meta_size(struct gguf_context * ctx) { size_t gguf_get_meta_size(const struct gguf_context * ctx) {
// no allocs - only compute size // no allocs - only compute size
struct gguf_buf buf = gguf_buf_init(0); struct gguf_buf buf = gguf_buf_init(0);
@ -21329,7 +21349,7 @@ size_t gguf_get_meta_size(struct gguf_context * ctx) {
return buf.offset; return buf.offset;
} }
void gguf_get_meta_data(struct gguf_context * ctx, void * data) { void gguf_get_meta_data(const struct gguf_context * ctx, void * data) {
struct gguf_buf buf = gguf_buf_init(16*1024); struct gguf_buf buf = gguf_buf_init(16*1024);
gguf_write_to_buf(ctx, &buf, true); gguf_write_to_buf(ctx, &buf, true);
@ -21405,6 +21425,14 @@ int ggml_cpu_has_arm_fma(void) {
#endif #endif
} }
int ggml_cpu_has_metal(void) {
#if defined(GGML_USE_METAL)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_f16c(void) { int ggml_cpu_has_f16c(void) {
#if defined(__F16C__) #if defined(__F16C__)
return 1; return 1;
ggml.h (72 changes)
@ -195,6 +195,14 @@
# define GGML_DEPRECATED(func, hint) func # define GGML_DEPRECATED(func, hint) func
#endif #endif
#ifndef __GNUC__
# define GGML_ATTRIBUTE_FORMAT(...)
#elif defined(__MINGW32__)
# define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
#else
# define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
#endif
#include <stdint.h> #include <stdint.h>
#include <stddef.h> #include <stddef.h>
#include <stdbool.h> #include <stdbool.h>
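
GGML_ATTRIBUTE_FORMAT lets GCC and Clang type-check printf-style varargs at compile time; the gnu_printf flavor is needed on MinGW, whose runtime printf accepts different conversions than the GNU one the compiler assumes by default. It is applied below to ggml_format_name as (2, 3): argument 2 is the format string and the varargs begin at argument 3 (argument 1 is the tensor). The mechanism on a free function, where the indices are (1, 2):

```cpp
#include <cstdarg>
#include <cstdio>

#ifndef __GNUC__
#  define ATTRIBUTE_FORMAT(...)
#elif defined(__MINGW32__)
#  define ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
#else
#  define ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
#endif

ATTRIBUTE_FORMAT(1, 2) // argument 1 is the format, varargs start at argument 2
static void log_msg(const char * fmt, ...) {
    va_list args;
    va_start(args, fmt);
    vprintf(fmt, args);
    va_end(args);
}

int main() {
    log_msg("%d tensors\n", 8);    // ok
    // log_msg("%s tensors\n", 8); // rejected by -Wformat: %s expects char *
    return 0;
}
```
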
@ -704,6 +712,7 @@ extern "C" {
GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor); GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor);
GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name); GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name);
GGML_ATTRIBUTE_FORMAT(2, 3)
GGML_API struct ggml_tensor * ggml_format_name( struct ggml_tensor * tensor, const char * fmt, ...); GGML_API struct ggml_tensor * ggml_format_name( struct ggml_tensor * tensor, const char * fmt, ...);
// //
@ -1905,39 +1914,39 @@ extern "C" {
GGML_API const char * gguf_type_name(enum gguf_type type); GGML_API const char * gguf_type_name(enum gguf_type type);
GGML_API int gguf_get_version (struct gguf_context * ctx); GGML_API int gguf_get_version (const struct gguf_context * ctx);
GGML_API size_t gguf_get_alignment (struct gguf_context * ctx); GGML_API size_t gguf_get_alignment (const struct gguf_context * ctx);
GGML_API size_t gguf_get_data_offset(struct gguf_context * ctx); GGML_API size_t gguf_get_data_offset(const struct gguf_context * ctx);
GGML_API void * gguf_get_data (struct gguf_context * ctx); GGML_API void * gguf_get_data (const struct gguf_context * ctx);
GGML_API int gguf_get_n_kv(struct gguf_context * ctx); GGML_API int gguf_get_n_kv(const struct gguf_context * ctx);
GGML_API int gguf_find_key(struct gguf_context * ctx, const char * key); GGML_API int gguf_find_key(const struct gguf_context * ctx, const char * key);
GGML_API const char * gguf_get_key (struct gguf_context * ctx, int i); GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int i);
GGML_API enum gguf_type gguf_get_kv_type (struct gguf_context * ctx, int i); GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int i);
GGML_API enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i); GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int i);
// results are undefined if the wrong type is used for the key // results are undefined if the wrong type is used for the key
GGML_API uint8_t gguf_get_val_u8 (struct gguf_context * ctx, int i); GGML_API uint8_t gguf_get_val_u8 (const struct gguf_context * ctx, int i);
GGML_API int8_t gguf_get_val_i8 (struct gguf_context * ctx, int i); GGML_API int8_t gguf_get_val_i8 (const struct gguf_context * ctx, int i);
GGML_API uint16_t gguf_get_val_u16 (struct gguf_context * ctx, int i); GGML_API uint16_t gguf_get_val_u16 (const struct gguf_context * ctx, int i);
GGML_API int16_t gguf_get_val_i16 (struct gguf_context * ctx, int i); GGML_API int16_t gguf_get_val_i16 (const struct gguf_context * ctx, int i);
GGML_API uint32_t gguf_get_val_u32 (struct gguf_context * ctx, int i); GGML_API uint32_t gguf_get_val_u32 (const struct gguf_context * ctx, int i);
GGML_API int32_t gguf_get_val_i32 (struct gguf_context * ctx, int i); GGML_API int32_t gguf_get_val_i32 (const struct gguf_context * ctx, int i);
GGML_API float gguf_get_val_f32 (struct gguf_context * ctx, int i); GGML_API float gguf_get_val_f32 (const struct gguf_context * ctx, int i);
GGML_API uint64_t gguf_get_val_u64 (struct gguf_context * ctx, int i); GGML_API uint64_t gguf_get_val_u64 (const struct gguf_context * ctx, int i);
GGML_API int64_t gguf_get_val_i64 (struct gguf_context * ctx, int i); GGML_API int64_t gguf_get_val_i64 (const struct gguf_context * ctx, int i);
GGML_API double gguf_get_val_f64 (struct gguf_context * ctx, int i); GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int i);
GGML_API bool gguf_get_val_bool(struct gguf_context * ctx, int i); GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int i);
GGML_API const char * gguf_get_val_str (struct gguf_context * ctx, int i); GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int i);
GGML_API int gguf_get_arr_n (struct gguf_context * ctx, int i); GGML_API int gguf_get_arr_n (const struct gguf_context * ctx, int i);
GGML_API const void * gguf_get_arr_data(struct gguf_context * ctx, int i); GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int i);
GGML_API const char * gguf_get_arr_str (struct gguf_context * ctx, int key_id, int i); GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);
GGML_API int gguf_get_n_tensors (struct gguf_context * ctx); GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx);
GGML_API int gguf_find_tensor (struct gguf_context * ctx, const char * name); GGML_API int gguf_find_tensor (const struct gguf_context * ctx, const char * name);
GGML_API size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i); GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i);
GGML_API char * gguf_get_tensor_name (struct gguf_context * ctx, int i); GGML_API char * gguf_get_tensor_name (const struct gguf_context * ctx, int i);
// overrides existing values or adds a new one // overrides existing values or adds a new one
GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val); GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val);
@ -1982,11 +1991,11 @@ extern "C" {
// //
// write the entire context to a binary file // write the entire context to a binary file
GGML_API void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta); GGML_API void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta);
// get the size in bytes of the meta data (header, kv pairs, tensor info) including padding // get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
GGML_API size_t gguf_get_meta_size(struct gguf_context * ctx); GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx);
GGML_API void gguf_get_meta_data(struct gguf_context * ctx, void * data); GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data);
// //
// system info // system info
@ -2000,6 +2009,7 @@ extern "C" {
GGML_API int ggml_cpu_has_fma (void); GGML_API int ggml_cpu_has_fma (void);
GGML_API int ggml_cpu_has_neon (void); GGML_API int ggml_cpu_has_neon (void);
GGML_API int ggml_cpu_has_arm_fma (void); GGML_API int ggml_cpu_has_arm_fma (void);
GGML_API int ggml_cpu_has_metal (void);
GGML_API int ggml_cpu_has_f16c (void); GGML_API int ggml_cpu_has_f16c (void);
GGML_API int ggml_cpu_has_fp16_va (void); GGML_API int ggml_cpu_has_fp16_va (void);
GGML_API int ggml_cpu_has_wasm_simd (void); GGML_API int ggml_cpu_has_wasm_simd (void);
@ -77,12 +77,14 @@ KEY_TOKENIZER_RWKV = "tokenizer.rwkv.world"
class MODEL_ARCH(IntEnum): class MODEL_ARCH(IntEnum):
LLAMA : int = auto() LLAMA : int = auto()
FALCON : int = auto() FALCON : int = auto()
GPT2 : int = auto() BAICHUAN : int = auto()
GPTJ : int = auto() GPT2 : int = auto()
GPTNEOX: int = auto() GPTJ : int = auto()
MPT : int = auto() GPTNEOX : int = auto()
MPT : int = auto()
STARCODER : int = auto()
class MODEL_TENSOR(IntEnum): class MODEL_TENSOR(IntEnum):
@ -106,12 +108,14 @@ class MODEL_TENSOR(IntEnum):
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.LLAMA: "llama", MODEL_ARCH.LLAMA: "llama",
MODEL_ARCH.FALCON: "falcon", MODEL_ARCH.FALCON: "falcon",
MODEL_ARCH.GPT2: "gpt2", MODEL_ARCH.BAICHUAN: "baichuan",
MODEL_ARCH.GPTJ: "gptj", MODEL_ARCH.GPT2: "gpt2",
MODEL_ARCH.GPTNEOX: "gptneox", MODEL_ARCH.GPTJ: "gptj",
MODEL_ARCH.MPT: "mpt", MODEL_ARCH.GPTNEOX: "gptneox",
MODEL_ARCH.MPT: "mpt",
MODEL_ARCH.STARCODER: "starcoder",
} }
MODEL_TENSOR_NAMES: dict[MODEL_ARCH, dict[MODEL_TENSOR, str]] = { MODEL_TENSOR_NAMES: dict[MODEL_ARCH, dict[MODEL_TENSOR, str]] = {
@ -153,6 +157,34 @@ MODEL_TENSOR_NAMES: dict[MODEL_ARCH, dict[MODEL_TENSOR, str]] = {
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
}, },
MODEL_ARCH.BAICHUAN: {
MODEL_TENSOR.TOKEN_EMBD: "token_embd",
MODEL_TENSOR.OUTPUT_NORM: "output_norm",
MODEL_TENSOR.OUTPUT: "output",
MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q",
MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k",
MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v",
MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
},
MODEL_ARCH.STARCODER: {
MODEL_TENSOR.TOKEN_EMBD: "token_embd",
MODEL_TENSOR.POS_EMBD: "position_embd",
MODEL_TENSOR.OUTPUT_NORM: "output_norm",
MODEL_TENSOR.OUTPUT: "output",
MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
},
MODEL_ARCH.GPT2: { MODEL_ARCH.GPT2: {
# TODO # TODO
}, },
@ -165,6 +197,10 @@ MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.ROPE_FREQS, MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ATTN_ROT_EMBD, MODEL_TENSOR.ATTN_ROT_EMBD,
], ],
MODEL_ARCH.BAICHUAN: [
MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ATTN_ROT_EMBD,
],
} }
@ -187,7 +223,7 @@ class TensorNameMap:
# Output # Output
MODEL_TENSOR.OUTPUT: ( MODEL_TENSOR.OUTPUT: (
"embed_out", # gptneox "embed_out", # gptneox
"lm_head", # gpt2 mpt falcon llama-hf "lm_head", # gpt2 mpt falcon llama-hf baichuan
"output", # llama-pth "output", # llama-pth
), ),
@ -195,7 +231,7 @@ class TensorNameMap:
MODEL_TENSOR.OUTPUT_NORM: ( MODEL_TENSOR.OUTPUT_NORM: (
"gpt_neox.final_layer_norm", # gptneox "gpt_neox.final_layer_norm", # gptneox
"transformer.ln_f", # gpt2 falcon "transformer.ln_f", # gpt2 falcon
"model.norm", # llama-hf "model.norm", # llama-hf baichuan
"norm", # llama-pth "norm", # llama-pth
), ),
@ -311,6 +347,7 @@ class TensorNameMap:
tensor_name = tensor_names.get(tensor) tensor_name = tensor_names.get(tensor)
if tensor_name is None: if tensor_name is None:
continue continue
mapping[tensor_name] = (tensor, tensor_name)
for key in keys: for key in keys:
mapping[key] = (tensor, tensor_name) mapping[key] = (tensor, tensor_name)
for bid in range(n_blocks): for bid in range(n_blocks):
@ -319,11 +356,12 @@ class TensorNameMap:
if tensor_name is None: if tensor_name is None:
continue continue
tensor_name = tensor_name.format(bid = bid) tensor_name = tensor_name.format(bid = bid)
mapping[tensor_name] = (tensor, tensor_name)
for key in keys: for key in keys:
key = key.format(bid = bid) key = key.format(bid = bid)
mapping[key] = (tensor, tensor_name) mapping[key] = (tensor, tensor_name)
def get_type_and_name(self, key: str, try_suffixes: Sequence[str]) -> tuple[MODEL_TENSOR, str] | None: def get_type_and_name(self, key: str, try_suffixes: Sequence[str] = ()) -> tuple[MODEL_TENSOR, str] | None:
result = self.mapping.get(key) result = self.mapping.get(key)
if result is not None: if result is not None:
return result return result
@ -334,13 +372,13 @@ class TensorNameMap:
return (result[0], result[1] + suffix) return (result[0], result[1] + suffix)
return None return None
def get_name(self, key: str, try_suffixes: Sequence[str]) -> str | None: def get_name(self, key: str, try_suffixes: Sequence[str] = ()) -> str | None:
result = self.get_type_and_name(key, try_suffixes = try_suffixes) result = self.get_type_and_name(key, try_suffixes = try_suffixes)
if result is None: if result is None:
return None return None
return result[1] return result[1]
def get_type(self, key: str, try_suffixes: Sequence[str]) -> MODEL_TENSOR | None: def get_type(self, key: str, try_suffixes: Sequence[str] = ()) -> MODEL_TENSOR | None:
result = self.get_type_and_name(key, try_suffixes = try_suffixes) result = self.get_type_and_name(key, try_suffixes = try_suffixes)
if result is None: if result is None:
return None return None
@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "gguf" name = "gguf"
version = "0.3.2" version = "0.3.3"
description = "Write ML models in GGUF for GGML" description = "Write ML models in GGUF for GGML"
authors = ["GGML <ggml@ggml.ai>"] authors = ["GGML <ggml@ggml.ai>"]
packages = [ packages = [
@ -2609,7 +2609,10 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
memcpy(utmp, x[i].scales, 12); memcpy(utmp, x[i].scales, 12);
const uint32x2_t mins8 = {utmp[1] & kmask1, ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4)}; uint32x2_t mins8 = { 0 };
mins8 = vset_lane_u32(utmp[1] & kmask1, mins8, 0);
mins8 = vset_lane_u32(((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4), mins8, 1);
utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
utmp[0] &= kmask1; utmp[0] &= kmask1;
llama.cpp (1133 changes): file diff suppressed because it is too large
@ -389,6 +389,7 @@ extern "C" {
LLAMA_API int llama_tokenize( LLAMA_API int llama_tokenize(
struct llama_context * ctx, struct llama_context * ctx,
const char * text, const char * text,
int text_len,
llama_token * tokens, llama_token * tokens,
int n_max_tokens, int n_max_tokens,
bool add_bos); bool add_bos);
@ -396,6 +397,7 @@ extern "C" {
LLAMA_API int llama_tokenize_with_model( LLAMA_API int llama_tokenize_with_model(
const struct llama_model * model, const struct llama_model * model,
const char * text, const char * text,
int text_len,
llama_token * tokens, llama_token * tokens,
int n_max_tokens, int n_max_tokens,
bool add_bos); bool add_bos);
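
Both tokenize entry points gain an explicit byte length, so callers no longer depend on NUL termination and can tokenize substrings directly. A usage sketch against the new signature; the negative-return convention for an undersized token buffer is an assumption carried over from the existing n_max_tokens contract, which this diff does not show:

```cpp
#include <string>
#include <vector>
#include "llama.h"

// Tokenize a std::string with the new explicit-length llama_tokenize.
std::vector<llama_token> tokenize(llama_context * ctx, const std::string & text, bool add_bos) {
    // worst case: one token per byte, plus the optional BOS
    std::vector<llama_token> tokens(text.size() + (add_bos ? 1 : 0));
    int n = llama_tokenize(ctx, text.data(), (int) text.size(),
                           tokens.data(), (int) tokens.size(), add_bos);
    if (n < 0) n = 0; // buffer too small; a negative count reports the need
    tokens.resize(n);
    return tokens;
}
```
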
@ -555,7 +557,9 @@ extern "C" {
struct ggml_tensor; struct ggml_tensor;
const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx); const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
struct llama_context * ctx
);
#endif // LLAMA_API_INTERNAL #endif // LLAMA_API_INTERNAL
@ -16,7 +16,7 @@
constexpr int kVecSize = 1 << 18; constexpr int kVecSize = 1 << 18;
float drawFromGaussianPdf(std::mt19937& rndm) { static float drawFromGaussianPdf(std::mt19937& rndm) {
constexpr double kScale = 1./(1. + std::mt19937::max()); constexpr double kScale = 1./(1. + std::mt19937::max());
constexpr double kTwoPiTimesScale = 6.28318530717958647692*kScale; constexpr double kTwoPiTimesScale = 6.28318530717958647692*kScale;
static float lastX; static float lastX;
@ -28,7 +28,8 @@ float drawFromGaussianPdf(std::mt19937& rndm) {
haveX = true; haveX = true;
return r*cos(phi); return r*cos(phi);
} }
void fillRandomGaussianFloats(std::vector<float>& values, std::mt19937& rndm, float mean = 0) {
static void fillRandomGaussianFloats(std::vector<float>& values, std::mt19937& rndm, float mean = 0) {
for (auto& v : values) v = mean + drawFromGaussianPdf(rndm); for (auto& v : values) v = mean + drawFromGaussianPdf(rndm);
} }
@ -0,0 +1,4 @@
以下内容为人类用户与一位智能助手的对话。
用户:你好!
助手:
@ -13,7 +13,7 @@ CLI_ARGS_MAIN_PERPLEXITY = [
"hellaswag-tasks", "ignore-eos", "in-prefix", "in-prefix-bos", "in-suffix", "instruct", "hellaswag-tasks", "ignore-eos", "in-prefix", "in-prefix-bos", "in-suffix", "instruct",
"interactive", "interactive-first", "keep", "logdir", "logit-bias", "lora", "lora-base", "interactive", "interactive-first", "keep", "logdir", "logit-bias", "lora", "lora-base",
"low-vram", "main-gpu", "memory-f32", "mirostat", "mirostat-ent", "mirostat-lr", "mlock", "low-vram", "main-gpu", "memory-f32", "mirostat", "mirostat-ent", "mirostat-lr", "mlock",
"model", "mtest", "multiline-input", "n-gpu-layers", "n-predict", "no-mmap", "no-mul-mat-q", "model", "multiline-input", "n-gpu-layers", "n-predict", "no-mmap", "no-mul-mat-q",
"np-penalize-nl", "numa", "ppl-output-type", "ppl-stride", "presence-penalty", "prompt", "np-penalize-nl", "numa", "ppl-output-type", "ppl-stride", "presence-penalty", "prompt",
"prompt-cache", "prompt-cache-all", "prompt-cache-ro", "random-prompt", "repeat-last-n", "prompt-cache", "prompt-cache-all", "prompt-cache-ro", "random-prompt", "repeat-last-n",
"repeat-penalty", "reverse-prompt", "rope-freq-base", "rope-freq-scale", "rope-scale", "seed", "repeat-penalty", "reverse-prompt", "rope-freq-base", "rope-freq-scale", "rope-scale", "seed",
@ -0,0 +1,69 @@
set(LLAMA_VERSION @LLAMA_INSTALL_VERSION@)
set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@)
set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@)
set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@)
set(LLAMA_BLAS @LLAMA_BLAS@)
set(LLAMA_CUBLAS @LLAMA_CUBLAS@)
set(LLAMA_METAL @LLAMA_METAL@)
set(LLAMA_MPI @LLAMA_MPI@)
set(LLAMA_CLBLAST @LLAMA_CLBLAST@)
set(LLAMA_HIPBLAS @LLAMA_HIPBLAS@)
set(LLAMA_ACCELERATE @LLAMA_ACCELERATE@)
@PACKAGE_INIT@
set_and_check(LLAMA_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@")
set_and_check(LLAMA_LIB_DIR "@PACKAGE_LLAMA_LIB_INSTALL_DIR@")
set_and_check(LLAMA_BIN_DIR "@PACKAGE_LLAMA_BIN_INSTALL_DIR@")
# Ensure transitive dependencies are satisfied
find_package(Threads REQUIRED)
if (APPLE AND LLAMA_ACCELERATE)
find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED)
endif()
if (LLAMA_BLAS)
find_package(BLAS REQUIRED)
endif()
if (LLAMA_CUBLAS)
find_package(CUDAToolkit REQUIRED)
endif()
if (LLAMA_METAL)
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
find_library(METAL_FRAMEWORK Metal REQUIRED)
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
endif()
if (LLAMA_MPI)
find_package(MPI REQUIRED)
endif()
if (LLAMA_CLBLAST)
find_package(CLBlast REQUIRED)
endif()
if (LLAMA_HIPBLAS)
find_package(hip REQUIRED)
find_package(hipblas REQUIRED)
find_package(rocblas REQUIRED)
endif()
find_library(llama_LIBRARY llama
REQUIRED
HINTS ${LLAMA_LIB_DIR})
set(_llama_link_deps "Threads::Threads" "@LLAMA_EXTRA_LIBS@")
add_library(llama UNKNOWN IMPORTED)
set_target_properties(llama
PROPERTIES
INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}"
INTERFACE_LINK_LIBRARIES "${_llama_link_deps}"
IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
IMPORTED_LOCATION "${llama_LIBRARY}"
INTERFACE_COMPILE_FEATURES cxx_std_11
POSITION_INDEPENDENT_CODE ON )
check_required_components(Llama)
@ -2,6 +2,8 @@ set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.h.in")
set(HEADER_FILE "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h") set(HEADER_FILE "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h")
set(BUILD_NUMBER 0) set(BUILD_NUMBER 0)
set(BUILD_COMMIT "unknown") set(BUILD_COMMIT "unknown")
set(BUILD_COMPILER "unknown")
set(BUILD_TARGET "unknown")
# Look for git # Look for git
find_package(Git) find_package(Git)
@ -41,11 +43,45 @@ if(Git_FOUND)
endif() endif()
endif() endif()
if(GIT_HEAD_RESULT EQUAL 0 AND GIT_COUNT_RESULT EQUAL 0)
set(BUILD_COMMIT ${HEAD})
set(BUILD_NUMBER ${COUNT})
endif()
execute_process(
COMMAND sh -c "$@ --version | head -1" _ ${CMAKE_C_COMPILER}
OUTPUT_VARIABLE OUT
OUTPUT_STRIP_TRAILING_WHITESPACE
RESULT_VARIABLE RES
)
if (RES EQUAL 0)
set(BUILD_COMPILER ${OUT})
endif()
execute_process(
COMMAND ${CMAKE_C_COMPILER} -dumpmachine
OUTPUT_VARIABLE OUT
OUTPUT_STRIP_TRAILING_WHITESPACE
RESULT_VARIABLE RES
)
if (RES EQUAL 0)
set(BUILD_TARGET ${OUT})
endif()
# Only write the header if it's changed to prevent unnecessary recompilation # Only write the header if it's changed to prevent unnecessary recompilation
if(EXISTS ${HEADER_FILE}) if(EXISTS ${HEADER_FILE})
file(STRINGS ${HEADER_FILE} CONTENTS REGEX "BUILD_COMMIT \"([^\"]*)\"") file(READ ${HEADER_FILE} CONTENTS)
list(GET CONTENTS 0 EXISTING) string(REGEX MATCH "BUILD_COMMIT \"([^\"]*)\"" _ ${CONTENTS})
if(NOT EXISTING STREQUAL "#define BUILD_COMMIT \"${BUILD_COMMIT}\"") set(OLD_COMMIT ${CMAKE_MATCH_1})
string(REGEX MATCH "BUILD_COMPILER \"([^\"]*)\"" _ ${CONTENTS})
set(OLD_COMPILER ${CMAKE_MATCH_1})
string(REGEX MATCH "BUILD_TARGET \"([^\"]*)\"" _ ${CONTENTS})
set(OLD_TARGET ${CMAKE_MATCH_1})
if (
NOT OLD_COMMIT STREQUAL BUILD_COMMIT OR
NOT OLD_COMPILER STREQUAL BUILD_COMPILER OR
NOT OLD_TARGET STREQUAL BUILD_TARGET
)
configure_file(${TEMPLATE_FILE} ${HEADER_FILE}) configure_file(${TEMPLATE_FILE} ${HEADER_FILE})
endif() endif()
else() else()
@ -3,5 +3,7 @@
#define BUILD_NUMBER @BUILD_NUMBER@ #define BUILD_NUMBER @BUILD_NUMBER@
#define BUILD_COMMIT "@BUILD_COMMIT@" #define BUILD_COMMIT "@BUILD_COMMIT@"
#define BUILD_COMPILER "@BUILD_COMPILER@"
#define BUILD_TARGET "@BUILD_TARGET@"
#endif // BUILD_INFO_H #endif // BUILD_INFO_H
@@ -1,23 +1,35 @@
#!/bin/sh
-BUILD_NUMBER="0"
-BUILD_COMMIT="unknown"
-REV_LIST=$(git rev-list --count HEAD)
-if [ $? -eq 0 ]; then
-    BUILD_NUMBER=$REV_LIST
+CC=$1
+
+build_number="0"
+build_commit="unknown"
+build_compiler="unknown"
+build_target="unknown"
+
+if out=$(git rev-list --count HEAD); then
+    # git is broken on WSL so we need to strip extra newlines
+    build_number=$(printf '%s' "$out" | tr -d '\n')
fi
-REV_PARSE=$(git rev-parse --short HEAD)
-if [ $? -eq 0 ]; then
-    BUILD_COMMIT=$REV_PARSE
+if out=$(git rev-parse --short HEAD); then
+    build_commit=$(printf '%s' "$out" | tr -d '\n')
+fi
+if out=$($CC --version | head -1); then
+    build_compiler=$out
+fi
+if out=$($CC -dumpmachine); then
+    build_target=$out
fi
echo "#ifndef BUILD_INFO_H"
echo "#define BUILD_INFO_H"
-echo ""
-echo "#define BUILD_NUMBER $BUILD_NUMBER" | tr -d '\n'
-echo ""
-echo "#define BUILD_COMMIT \"$BUILD_COMMIT\"" | tr -d '\n'
-echo ""
+echo
+echo "#define BUILD_NUMBER $build_number"
+echo "#define BUILD_COMMIT \"$build_commit\""
+echo "#define BUILD_COMPILER \"$build_compiler\""
+echo "#define BUILD_TARGET \"$build_target\""
+echo
echo "#endif // BUILD_INFO_H"

tests/CMakeLists.txt
View file

@@ -29,9 +29,8 @@ llama_build_executable(test-tokenizer-0-llama.cpp)
llama_test_executable (test-tokenizer-0-llama test-tokenizer-0-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
llama_build_executable(test-tokenizer-0-falcon.cpp)
#llama_test_executable (test-tokenizer-0-falcon test-tokenizer-0-falcon.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
-llama_build_executable(test-tokenizer-1.cpp)
-# test-tokenizer-1 requires a BPE vocab. re-enable when we have one.
-#llama_test_executable (test-tokenizer-1.llama test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
+llama_build_executable(test-tokenizer-1-llama.cpp)
+llama_test_executable (test-tokenizer-1-llama test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
#llama_test_executable(test-tokenizer-1.aquila test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
llama_build_and_test_executable(test-grammar-parser.cpp)
llama_build_and_test_executable(test-llama-grammar.cpp)

tests/test-grad0.cpp
View file

@@ -36,15 +36,15 @@
#define GGML_PRINT(...) printf(__VA_ARGS__)
-float frand(void) {
+static float frand(void) {
    return (float)rand()/(float)RAND_MAX;
}
-int irand(int n) {
+static int irand(int n) {
    return rand()%n;
}
-void get_random_dims(int64_t * dims, int ndims) {
+static void get_random_dims(int64_t * dims, int ndims) {
    dims[0] = dims[1] = dims[2] = dims[3] = 1;
    for (int i = 0; i < ndims; i++) {
@@ -52,7 +52,7 @@ void get_random_dims(int64_t * dims, int ndims) {
    }
}
-void get_random_dims_minmax(int64_t * dims, int ndims, int min, int max) {
+static void get_random_dims_minmax(int64_t * dims, int ndims, int min, int max) {
    dims[0] = dims[1] = dims[2] = dims[3] = 1;
    for (int i = 0; i < ndims; i++) {
@@ -61,12 +61,9 @@ void get_random_dims_minmax(int64_t * dims, int ndims, int min, int max) {
}
-struct ggml_tensor * get_random_tensor(
-        struct ggml_context * ctx0,
-        int ndims,
-        int64_t ne[],
-        float fmin,
-        float fmax) {
+static struct ggml_tensor * get_random_tensor(
+    struct ggml_context * ctx0, int ndims, int64_t ne[], float fmin, float fmax
+) {
    struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F32, ndims, ne);
    switch (ndims) {
@@ -109,11 +106,11 @@ struct ggml_tensor * get_random_tensor(
    return result;
}
-float get_element(const struct ggml_tensor * t, int idx) {
+static float get_element(const struct ggml_tensor * t, int idx) {
    return ((float *)t->data)[idx];
}
-void set_element(struct ggml_tensor * t, int idx, float value) {
+static void set_element(struct ggml_tensor * t, int idx, float value) {
    ((float *)t->data)[idx] = value;
}
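
The change running through this and the following test files is purely about linkage: marking a file-local helper static gives it internal linkage, which silences missing-prototype warnings and rules out symbol clashes if translation units are ever linked together. A minimal sketch of the clash being ruled out (hypothetical files, not from this commit):

// a.cpp
void helper() {}   // external linkage
// b.cpp
void helper() {}   // linking a.o with b.o fails: duplicate symbol
// declaring both as 'static void helper() {}' keeps each definition
// private to its translation unit, so both object files link fine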

tests/test-quantize-fns.cpp
View file

@@ -13,24 +13,24 @@
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
-const float MAX_QUANTIZATION_REFERENCE_ERROR = 0.0001f;
-const float MAX_QUANTIZATION_TOTAL_ERROR = 0.002f;
-const float MAX_QUANTIZATION_TOTAL_ERROR_2BITS = 0.0075f;
-const float MAX_QUANTIZATION_TOTAL_ERROR_3BITS = 0.0040f;
-const float MAX_DOT_PRODUCT_ERROR = 0.02f;
-const char* RESULT_STR[] = {"ok", "FAILED"};
+constexpr float MAX_QUANTIZATION_REFERENCE_ERROR = 0.0001f;
+constexpr float MAX_QUANTIZATION_TOTAL_ERROR = 0.002f;
+constexpr float MAX_QUANTIZATION_TOTAL_ERROR_2BITS = 0.0075f;
+constexpr float MAX_QUANTIZATION_TOTAL_ERROR_3BITS = 0.0040f;
+constexpr float MAX_DOT_PRODUCT_ERROR = 0.02f;
+static const char* RESULT_STR[] = {"ok", "FAILED"};
// Generate synthetic data
-void generate_data(float offset, size_t n, float * dst) {
+static void generate_data(float offset, size_t n, float * dst) {
    for (size_t i = 0; i < n; i++) {
        dst[i] = 0.1 + 2*cosf(i + offset);
    }
}
// Calculate RMSE between two float arrays
-float array_rmse(const float * a1, const float * a2, size_t n) {
+static float array_rmse(const float * a1, const float * a2, size_t n) {
    double sum = 0;
    for (size_t i = 0; i < n; i++) {
        double diff = a1[i] - a2[i];
@@ -40,7 +40,7 @@ float array_rmse(const float * a1, const float * a2, size_t n) {
}
// Total quantization error on test data
-float total_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) {
+static float total_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) {
    std::vector<uint8_t> tmp_q(2*test_size);
    std::vector<float> tmp_out(test_size);
@@ -50,7 +50,7 @@ float total_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) {
}
// Total quantization error on test data
-float reference_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) {
+static float reference_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) {
    std::vector<uint8_t> tmp_q(2*test_size);
    std::vector<float> tmp_out(test_size);
    std::vector<float> tmp_out_ref(test_size);
@@ -64,7 +64,7 @@ float reference_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) {
    return array_rmse(tmp_out.data(), tmp_out_ref.data(), test_size);
}
-float dot_product(const float * a1, const float * a2, size_t test_size) {
+static float dot_product(const float * a1, const float * a2, size_t test_size) {
    double sum = 0;
    for (size_t i = 0; i < test_size; i++) {
        sum += a1[i] * a2[i];
@@ -73,7 +73,9 @@ float dot_product(const float * a1, const float * a2, size_t test_size) {
}
// Total dot product error
-float dot_product_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data1, const float *test_data2) {
+static float dot_product_error(
+    ggml_type_traits_t & qfns, size_t test_size, const float * test_data1, const float *test_data2
+) {
    std::vector<uint8_t> tmp_q1(2*test_size);
    std::vector<uint8_t> tmp_q2(2*test_size);

tests/test-quantize-perf.cpp
View file

@@ -61,22 +61,22 @@ inline int64_t cpu_cycles() {
// Generate synthetic data
-void generate_data(float offset, size_t n, float * dst) {
+static void generate_data(float offset, size_t n, float * dst) {
    for (size_t i = 0; i < n; i++) {
        dst[i] = 0.1 + 2*cosf(i + offset);
    }
}
-float gigabytes_per_second(size_t bytes, int64_t usecs) {
+static float gigabytes_per_second(size_t bytes, int64_t usecs) {
    return bytes / (float) usecs * 1000000 / (1024*1024*1024);
}
-void * align_with_offset(void * ptr, int offset) {
+static void * align_with_offset(void * ptr, int offset) {
    size_t dummy_size = MAX_ALIGNMENT * 4;
    return (char *) std::align(MAX_ALIGNMENT, MAX_ALIGNMENT, ptr, dummy_size) + offset;
}
-void benchmark_function(size_t size, size_t q_size, int64_t iterations, const std::function<size_t(void)> & function) {
+static void benchmark_function(size_t size, size_t q_size, int64_t iterations, const std::function<size_t(void)> & function) {
    int64_t min_time_us = INT64_MAX;
    int64_t total_time_us = 0;
    int64_t min_time_cycles = INT64_MAX;
@@ -108,7 +108,7 @@ void benchmark_function(size_t size, size_t q_size, int64_t iterations, const std::function<size_t(void)> & function) {
    printf(" quantized throughput : %9.2f GB/s\n", gigabytes_per_second(q_size * iterations, total_time_us));
}
-void usage(char * argv[]) {
+static void usage(char * argv[]) {
    printf("Benchmark quantization specific functions on synthetic data\n");
    printf("\n");
    printf("usage: %s [options]\n", argv[0]);

tests/test-sampling.cpp
View file

@@ -12,7 +12,8 @@
#include <vector>
#include <algorithm>
-void dump(const llama_token_data_array * candidates) {
+static void dump(const llama_token_data_array * candidates) {
    for (size_t i = 0; i < candidates->size; i++) {
        printf("%d: %f (%f)\n", candidates->data[i].id, candidates->data[i].p, candidates->data[i].logit);
    }
@@ -21,9 +22,7 @@ void dump(const llama_token_data_array * candidates) {
#define DUMP(__candidates) do { printf("%s:%d (%s)\n", __FILE__, __LINE__, __func__); dump((__candidates)); printf("-\n"); } while(0)
-void test_top_k(const std::vector<float> & probs,
-                const std::vector<float> & expected_probs,
-                int k) {
+static void test_top_k(const std::vector<float> & probs, const std::vector<float> & expected_probs, int k) {
    size_t n_vocab = probs.size();
    std::vector<llama_token_data> candidates;
    candidates.reserve(n_vocab);
@@ -45,10 +44,7 @@ void test_top_k(const std::vector<float> & probs,
}
-void test_top_p(const std::vector<float> & probs,
-                const std::vector<float> & expected_probs,
-                float p) {
+static void test_top_p(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
    size_t n_vocab = probs.size();
    std::vector<llama_token_data> candidates;
    candidates.reserve(n_vocab);
@@ -70,9 +66,7 @@ void test_top_p(const std::vector<float> & probs,
}
-void test_tfs(const std::vector<float> & probs,
-              const std::vector<float> & expected_probs,
-              float z) {
+static void test_tfs(const std::vector<float> & probs, const std::vector<float> & expected_probs, float z) {
    size_t n_vocab = probs.size();
    std::vector<llama_token_data> candidates;
    candidates.reserve(n_vocab);
@@ -93,9 +87,7 @@ void test_tfs(const std::vector<float> & probs,
}
-void test_typical(const std::vector<float> & probs,
-                  const std::vector<float> & expected_probs,
-                  float p) {
+static void test_typical(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
    size_t n_vocab = probs.size();
    std::vector<llama_token_data> candidates;
    candidates.reserve(n_vocab);
@@ -116,11 +108,10 @@ void test_typical(const std::vector<float> & probs,
}
-void test_repetition_penalty(
-        const std::vector<float> & probs,
-        const std::vector<llama_token> & last_tokens,
-        const std::vector<float> & expected_probs,
-        float penalty) {
+static void test_repetition_penalty(
+    const std::vector<float> & probs, const std::vector<llama_token> & last_tokens,
+    const std::vector<float> & expected_probs, float penalty
+) {
    assert(probs.size() == expected_probs.size());
    size_t n_vocab = probs.size();
@@ -145,11 +136,10 @@ void test_repetition_penalty(
}
-void test_frequency_presence_penalty(
-        const std::vector<float> & probs,
-        const std::vector<llama_token> & last_tokens,
-        const std::vector<float> & expected_probs,
-        float alpha_frequency, float alpha_presence) {
+static void test_frequency_presence_penalty(
+    const std::vector<float> & probs, const std::vector<llama_token> & last_tokens,
+    const std::vector<float> & expected_probs, float alpha_frequency, float alpha_presence
+) {
    assert(probs.size() == expected_probs.size());
    size_t n_vocab = probs.size();
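
All of the test_* helpers above share the same setup: the input probabilities are turned into logits and packed into a llama_token_data_array before the sampler under test runs. A condensed sketch of that shared step, using the types as they appear in this listing (a sketch, not a verbatim excerpt):

std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < (llama_token) n_vocab; token_id++) {
    float logit = logf(probs[token_id]);  // probability expressed as a logit
    candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
}
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };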

tests/test-tokenizer-0-llama.cpp
View file

@@ -1,5 +1,6 @@
#include "llama.h"
#include "common.h"
+#include "console.h"
#include <cstdio>
#include <string>
@@ -35,6 +36,7 @@ static const std::map<std::string, std::vector<llama_token>> & k_tests() {
        { " Hello" , { 1678, 15043, }, },
        { " Hello" , { 268, 15043, }, },
        { " Hello\n Hello" , { 268, 15043, 13, 1678, 15043, }, },
+       { " (" , { 29871, 313, }, },
    };
    return _k_tests;
@@ -89,6 +91,12 @@ int main(int argc, char **argv) {
        return 2;
    }
+#ifdef _WIN32
+    // We need this for unicode console support
+    console::init(false, false);
+    atexit([]() { console::cleanup(); });
+#endif
    bool success = true;
    for (const auto & test_kv : k_tests()) {

tests/test-tokenizer-1-llama.cpp Normal file
View file

@@ -0,0 +1,125 @@
#include "llama.h"
#include "common.h"
#include "console.h"
#include <cassert>
#include <cstdio>
#include <cstring>
#include <string>
#include <codecvt>
#include <map>
#include <vector>
#include <locale>
typedef int codepoint;
static std::string codepoint_to_utf8(codepoint cp) {
std::string result;
if (0x00 <= cp && cp <= 0x7f) {
result.push_back(cp);
} else if (0x80 <= cp && cp <= 0x7ff) {
result.push_back(0xc0 | ((cp >> 6) & 0x1f));
result.push_back(0x80 | (cp & 0x3f));
} else if (0x800 <= cp && cp <= 0xffff) {
result.push_back(0xe0 | ((cp >> 12) & 0x0f));
result.push_back(0x80 | ((cp >> 6) & 0x3f));
result.push_back(0x80 | (cp & 0x3f));
} else if (0x10000 <= cp && cp <= 0x10ffff) {
result.push_back(0xf0 | ((cp >> 18) & 0x07));
result.push_back(0x80 | ((cp >> 12) & 0x3f));
result.push_back(0x80 | ((cp >> 6) & 0x3f));
result.push_back(0x80 | (cp & 0x3f));
} else {
throw std::invalid_argument("invalid codepoint");
}
return result;
}
int main(int argc, char **argv) {
if (argc < 2) {
fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
return 1;
}
const std::string fname = argv[1];
fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
llama_model * model;
llama_context * ctx;
llama_backend_init(false);
// load the vocab
{
auto lparams = llama_context_default_params();
lparams.vocab_only = true;
model = llama_load_model_from_file(fname.c_str(), lparams);
if (model == NULL) {
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
return 1;
}
ctx = llama_new_context_with_model(model, lparams);
if (ctx == NULL) {
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
llama_free_model(model);
return 1;
}
}
GGML_ASSERT(llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM);
#ifdef _WIN32
// We need this for unicode console support
console::init(false, false);
atexit([]() { console::cleanup(); });
#endif
const int n_vocab = llama_n_vocab(ctx);
for (int i = 0; i < n_vocab; ++i) {
std::string str = llama_detokenize_spm(ctx, std::vector<int>(1, i));
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
std::string check = llama_detokenize_spm(ctx, tokens);
if (check != str) {
fprintf(stderr, "%s : error: token %d detokenizes to '%s'(%zu) but tokenization of this detokenizes to '%s'(%zu)\n",
__func__, i, str.c_str(), str.length(), check.c_str(), check.length());
return 2;
}
}
for (codepoint cp = 0x0000; cp < 0xffff; ++cp) {
if (cp < 0xd800 || cp > 0xdfff) {
std::string str = codepoint_to_utf8(cp);
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
std::string check = llama_detokenize_spm(ctx, tokens);
if (cp != 9601 && str != check) {
fprintf(stderr, "%s : error: codepoint %d detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
__func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
return 3;
}
}
}
for (codepoint cp = 0x10000; cp < 0x0010ffff; ++cp) {
std::string str = codepoint_to_utf8(cp);
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
std::string check = llama_detokenize_spm(ctx, tokens);
if (str != check) {
fprintf(stderr, "%s : error: codepoint %d detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
__func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
return 4;
}
}
llama_free_model(model);
llama_free(ctx);
llama_backend_free();
return 0;
}
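
As a quick sanity check on the encoder above (not part of the committed test), the four codepoint ranges it handles map to one through four UTF-8 bytes. A hypothetical standalone check, assuming a non-static copy of codepoint_to_utf8 is linked in:

#include <cassert>
#include <string>

std::string codepoint_to_utf8(int cp); // copy of the helper above, made non-static

int main() {
    assert(codepoint_to_utf8(0x41).size()    == 1); // ASCII 'A'
    assert(codepoint_to_utf8(0xE9).size()    == 2); // U+00E9, two-byte range
    assert(codepoint_to_utf8(0x20AC).size()  == 3); // U+20AC, three-byte range
    assert(codepoint_to_utf8(0x1F600).size() == 4); // U+1F600, four-byte range
    return 0;
}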

tests/test-tokenizer-1.cpp
View file

@@ -1,108 +0,0 @@
#include "llama.h"
#include "common.h"
#include <cassert>
#include <cstdio>
#include <cstring>
#include <string>
#include <codecvt>
#include <map>
#include <vector>
#include <locale>
static std::string escape_whitespace(const std::string& text) {
std::string result = "\xe2\x96\x81";
for (size_t offs = 0; offs < text.length(); ++offs) {
if (text[offs] == ' ') {
result += "\xe2\x96\x81";
} else {
result += text[offs];
}
}
return result;
}
int main(int argc, char **argv) {
if (argc < 2) {
fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
return 1;
}
const std::string fname = argv[1];
fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
llama_model * model;
llama_context * ctx;
llama_backend_init(false);
// load the vocab
{
auto lparams = llama_context_default_params();
lparams.vocab_only = true;
model = llama_load_model_from_file(fname.c_str(), lparams);
if (model == NULL) {
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
return 1;
}
ctx = llama_new_context_with_model(model, lparams);
if (ctx == NULL) {
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
llama_free_model(model);
return 1;
}
}
GGML_ASSERT(llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_BPE);
const int n_vocab = llama_n_vocab(ctx);
for (int i = 0; i < n_vocab; ++i) {
std::string forward = llama_token_to_piece(ctx, i);
std::vector<llama_token> tokens = llama_tokenize(ctx, forward, false);
if (tokens.size() == 1) {
if (i != tokens[0]) {
std::string backward = llama_token_to_piece(ctx, tokens[0]);
fprintf(stderr, "%s : error: token %d is string %s but bpe returns token %d %s\n",
__func__, i, llama_token_to_piece(ctx, i).c_str(), tokens[0], backward.c_str());
return 2;
}
}
}
#ifdef _WIN32
std::wstring_convert<typename std::codecvt_utf8<char16_t>, char16_t> u16converter;
for (char16_t ch = 0x0000; ch < 0xffff; ++ch) {
std::u16string u16str(1, ch);
std::string str = u16converter.to_bytes(u16str);
std::vector<llama_token> tokens = llama_tokenize(ctx, escape_whitespace(str).c_str(), false);
if (tokens.size() == 1) {
fprintf(stderr, "%s : info: %s tokenized to %d \n",
__func__, str.c_str(), tokens[0]);
}
}
std::wstring_convert<typename std::codecvt_utf8<char32_t>, char32_t> u32converter;
for (char32_t ch = 0x0000; ch < 0x0010ffff; ++ch) {
std::u32string u32str(1, ch);
std::string str = u32converter.to_bytes(u32str);
std::vector<llama_token> tokens = llama_tokenize(ctx, escape_whitespace(str).c_str(), false);
if (tokens.size() == 1) {
fprintf(stderr, "%s : info: %s tokenized to %d \n", __func__, str.c_str(), tokens[0]);
}
}
#endif
llama_free_model(model);
llama_free(ctx);
llama_backend_free();
return 0;
}