Merge branch 'master' of https://github.com/ggerganov/llama.cpp into Markdownish_Codeblock_fix

Commit 34b1f00bdb: 32 changed files with 2,665 additions and 1,001 deletions.
.github/workflows/build.yml (vendored, 131 changes)

- Every `uses: actions/checkout@v1` step becomes `actions/checkout@v3`, both in the active jobs (the Ubuntu, macOS, Windows, CUDA and release jobs) and in the commented-out legacy job definitions.
- Two new Apple cross-compile jobs, `macOS-latest-cmake-ios` and `macOS-latest-cmake-tvos`, are added after the macOS cmake test job. Both run on `macos-latest`, check out the repo, run `brew update`, and build with Xcode (the tvOS job is identical except for `-DCMAKE_SYSTEM_NAME=tvOS`):

```shell
sysctl -a
mkdir build
cd build
cmake -G Xcode .. \
  -DLLAMA_BUILD_EXAMPLES=OFF \
  -DLLAMA_BUILD_TESTS=OFF \
  -DLLAMA_BUILD_SERVER=OFF \
  -DCMAKE_SYSTEM_NAME=iOS \
  -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0
cmake --build . --config Release
```

- In the Windows CUDA job, the matrix entry `cuda: ['12.1.0', '11.7.1']` becomes `cuda: ['12.2.0', '11.7.1']`, and `Jimver/cuda-toolkit@v0.2.10` is bumped to `@v0.2.11`. Its `sub-packages` list is now `'["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'`, replacing the previous `# TODO(green-sky): _dev seems to fail, and non dev are not enough` note and the commented-out `'["nvcc", "cudart", "cublas", "cudart_dev", "cublas_dev"]'` list.
- The two CUDA-version-specific "Copy and pack Cuda runtime" steps (one guarded by `matrix.cuda == '12.1.0'`, one by `'11.7.1'`, each copying version-suffixed cudart/cublas/cublasLt DLLs by hand) are merged into a single unconditional step that copies by wildcard:

```powershell
echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
$dst='.\build\bin\cudart\'
robocopy "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip $dst\*
```

- The "Upload Cuda runtime" step keeps its release condition and still uploads `cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip`.
- A new `freeBSD-latest` job runs on `macos-12`, checks out with `actions/checkout@v3`, and builds inside a FreeBSD 13.2 VM via `cross-platform-actions/action@v0.19.0`:

```shell
sudo pkg update
sudo pkg install -y gmake automake autoconf pkgconf llvm15 clinfo clover opencl clblast openblas
gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15
```
.github/workflows/docker.yml (vendored, 15 changes)

- The build matrix entries gain an explicit `platforms` field, and four GPU image configurations are added:

```yaml
config:
  - { tag: "light", dockerfile: ".devops/main.Dockerfile", platforms: "linux/amd64,linux/arm64" }
  - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
  # NOTE(canardletter): The CUDA builds on arm64 are very slow, so I
  #                     have disabled them for now until the reason why
  #                     is understood.
  - { tag: "light-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platforms: "linux/amd64" }
  - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
  - { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
  - { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
```

- In both docker build-push steps, the hard-coded `platforms: linux/amd64,linux/arm64` becomes `platforms: ${{ matrix.config.platforms }}`; the context, push conditions, tags and `file: ${{ matrix.config.dockerfile }}` settings are unchanged.
.github/workflows/gguf-publish.yml (vendored, 2 changes)

- `- uses: actions/checkout@v2` is updated to `actions/checkout@v3`; the `actions/setup-python@v2` step is unchanged.
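This workflow publishes the `gguf` Python package that the conversion scripts touched in this commit depend on. As a rough sketch of the writer API those scripts exercise (the architecture choice, tensor name and values below are placeholders for illustration, not taken from the workflow itself):

```python
# Minimal sketch of the gguf writer API used by the converters in this commit.
# The model name, hyperparameters and tensor contents are made-up placeholders.
import numpy as np
import gguf

writer = gguf.GGUFWriter("tiny.gguf", gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA])
writer.add_name("tiny-example")      # model name metadata
writer.add_context_length(2048)      # example hyperparameters
writer.add_embedding_length(32)
writer.add_block_count(1)
writer.add_tensor("token_embd.weight", np.zeros((4, 32), dtype=np.float32))

writer.write_header_to_file()        # header, then key/value metadata, then tensor data
writer.write_kv_data_to_file()
writer.write_tensors_to_file()
writer.close()
```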
CMakeLists.txt (108 changes)

- `include(CheckCXXCompilerFlag)` is added after `find_package(Threads REQUIRED)`.
- The per-backend source lists are split into sources and headers: `GGML_HEADERS_METAL` (ggml-metal.h), `GGML_HEADERS_CUDA` (ggml-cuda.h), `GGML_HEADERS_OPENCL` (ggml-opencl.h), `GGML_HEADERS_MPI` (ggml-mpi.h) and `GGML_HEADERS_EXTRA` (k_quants.h) are introduced, and the matching `GGML_SOURCES_*` variables now list only the .c/.cu/.cpp/.m files. The `add_library(ggml OBJECT ...)` call passes both the source and the new header variables.
- In the hipBLAS block, `POSITION_INDEPENDENT_CODE ON` is set on `ggml-rocm` when `BUILD_SHARED_LIBS` is enabled, and the `target_compile_definitions(ggml-rocm PRIVATE CC_TURING=1000000000)` line is removed.
- Architecture detection now also consults the MSVC generator platform: `CMAKE_GENERATOR_PLATFORM` is lowercased into `CMAKE_GENERATOR_PLATFORM_LWR` (and printed), ARM is detected when it matches `arm64`, and x86 when it matches `x86_64|i686|amd64|x64`. For MSVC on ARM the build defines `__ARM_NEON`, `__ARM_FEATURE_FMA`, `__ARM_FEATURE_DOTPROD` and `__aarch64__` (MSVC defines `_M_ARM64` instead); `__ARM_FEATURE_FP16_VECTOR_ARITHMETIC` stays commented out because MSVC does not support `vdupq_n_f16`, `vld1q_f16` and `vst1q_f16`.
- For non-MSVC ARM builds, `-mfp16-format=ieee` is now added only if `check_cxx_compiler_flag` reports the compiler supports it, instead of being hard-coded into the Raspberry Pi armv6/armv7/armv8 flag sets.
- `_DARWIN_C_SOURCE` is now defined when `CMAKE_SYSTEM_NAME` matches Darwin, iOS, tvOS or DragonFly, replacing the separate Darwin and DragonFly branches.
- An install/package story is added: `include(CMakePackageConfigHelpers)`; cache variables `LLAMA_INCLUDE_INSTALL_DIR`, `LLAMA_LIB_INSTALL_DIR` and `LLAMA_BIN_INSTALL_DIR`; `LLAMA_BUILD_NUMBER`, `LLAMA_BUILD_COMMIT` and `LLAMA_INSTALL_VERSION` (0.0.${BUILD_NUMBER}); generation of `LlamaConfig.cmake` from `scripts/LlamaConfig.cmake.in` plus `LlamaConfigVersion.cmake` with `SameMajorVersion` compatibility, both installed into `${CMAKE_INSTALL_LIBDIR}/cmake/Llama`; public headers for `ggml` (ggml.h plus whichever backend headers are enabled) and for `llama` (llama.h); and `install(TARGETS ggml PUBLIC_HEADER)` / `install(TARGETS llama LIBRARY PUBLIC_HEADER)` replacing the old `install(TARGETS llama LIBRARY)` that lived inside the `BUILD_SHARED_LIBS` block. The `install(FILES convert.py PERMISSIONS ...)` rule is unchanged.
Makefile (33 changes)

- In `TEST_TARGETS` and the `test:` rule, `tests/test-tokenizer-1` is renamed to `tests/test-tokenizer-1-llama`, and its build rule now compiles `tests/test-tokenizer-1-llama.cpp`.
- The preprocessor defines that were duplicated across `MK_CFLAGS` and `MK_CXXFLAGS` are consolidated into `MK_CPPFLAGS`: `-D_XOPEN_SOURCE=600` (raised to 700 on OpenBSD), `-D_GNU_SOURCE` on Linux, `-D_DARWIN_C_SOURCE` on Darwin, `-D__BSD_VISIBLE` on DragonFly and FreeBSD, `-D_NETBSD_SOURCE` on NetBSD and `-D_BSD_SOURCE` on OpenBSD.
- The g++-only warning suppressions (`-Wno-format-truncation -Wno-array-bounds`) are now gated on `ifeq '' '$(findstring clang,$(shell $(CXX) --version))'` instead of looking for "clang++" in `$(CXX)`, so a clang compiler invoked under a different name is still detected.
- In the hipBLAS section, the `HIPFLAGS += -DCC_TURING=1000000000` line is removed; the DMMV/MMV/K-quants flags are unchanged.
Package.swift

- The manifest now branches on architecture. On arm/arm64 it declares supported platforms (macOS 11, iOS 14, watchOS 4, tvOS 14), compiles `ggml-metal.m` with `-fno-objc-arc`, and defines `GGML_SWIFT` and `GGML_USE_METAL`; on other architectures it keeps the previous behaviour and excludes `ggml-metal.metal`:

```swift
#if arch(arm) || arch(arm64)
let platforms: [SupportedPlatform]? = [
    .macOS(.v11),
    .iOS(.v14),
    .watchOS(.v4),
    .tvOS(.v14)
]
let exclude: [String] = []
let additionalSources: [String] = ["ggml-metal.m"]
let additionalSettings: [CSetting] = [
    .unsafeFlags(["-fno-objc-arc"]),
    .define("GGML_SWIFT"),
    .define("GGML_USE_METAL")
]
#else
let platforms: [SupportedPlatform]? = nil
let exclude: [String] = ["ggml-metal.metal"]
let additionalSources: [String] = []
let additionalSettings: [CSetting] = []
#endif
```

- The `Package` initializer gains `platforms: platforms`, the target's `exclude:` uses the new `exclude` constant, `additionalSources` is appended to the `sources:` list, `additionalSettings` is appended to `cSettings:`, and the trailing comma after the target declaration is dropped. The existing `GGML_USE_K_QUANTS`, `GGML_USE_ACCELERATE` and Accelerate linker settings are unchanged.
README.md (13 changes)

Under "#### Images", the two existing image descriptions now note their platforms, and four GPU images are documented:

1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`)
2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`)

Additionally, there are the following images, similar to the above:

- `ghcr.io/ggerganov/llama.cpp:full-cuda`: Same as `full` but compiled with CUDA support. (platforms: `linux/amd64`)
- `ghcr.io/ggerganov/llama.cpp:light-cuda`: Same as `light` but compiled with CUDA support. (platforms: `linux/amd64`)
- `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)

The GPU enabled images are not currently tested by CI beyond being built. They are built without any variation from the Dockerfiles defined in [.devops/](.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library), you'll need to build the images locally for now.

The "#### Usage" section that follows is unchanged.
Common argument handling (gpt_params_parse / gpt_print_usage / struct gpt_params)

- A new command-line option `-ngld N` / `--gpu-layers-draft` / `--n-gpu-layers-draft` is parsed alongside `--n-gpu-layers`. When built with GPU offload support it sets `params.n_gpu_layers_draft = std::stoi(argv[i])`; otherwise it prints the same "not compiled with GPU offload support" warning that `--n-gpu-layers` already prints.
- `gpt_print_usage` documents the new flag: `-ngld N, --n-gpu-layers-draft N` "number of layers to store in VRAM for the draft model".
- `struct gpt_params` gains `int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)` next to the existing `n_gpu_layers` field.
convert-baichuan-hf-to-gguf.py (new executable file, 292 lines)

A new standalone converter that turns a Hugging Face Baichuan checkpoint into a GGUF file. In outline:

- It parses `--vocab-only`, `--outfile`, a model directory and an `ftype` (0 = float32, 1 = float16), refuses models whose `config.json` architecture is not `BaichuanForCausalLM`, and counts `pytorch_model-*` shards.
- Model metadata is written with `gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.BAICHUAN])`: name, source HF repo, tensor data layout ("Meta AI original pth"), context length (from `max_sequence_length`, `max_position_embeddings` or `model_max_length`), embedding length, block count, feed-forward length, RoPE dimension count, head counts, RMS-norm epsilon, and a linear RoPE scale when `rope_scaling` requests one.
- The SentencePiece `tokenizer.model` is read to emit token text, scores and token types (normal, unknown, control, unused, byte), plus any entries from `added_tokens.json` as user-defined tokens with score -1000.0.
- The core of the tensor pass handles Baichuan's fused attention projection: `model.layers.{i}.self_attn.W_pack.weight` is split back into separate q/k/v projections before name mapping and writing:

```python
def reverse_hf_permute(weights: NDArray, n_head: int, n_kv_head: int | None = None) -> NDArray:
    if n_kv_head is not None and n_head != n_kv_head:
        n_head //= n_kv_head
    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
            .swapaxes(1, 2)
            .reshape(weights.shape))

def reverse_hf_permute_part(weights: NDArray, n_part: int, n_head: int, n_head_kv: int | None = None) -> NDArray:
    r = weights.shape[0] // 3
    return reverse_hf_permute(weights[r * n_part : r * n_part + r, ...], n_head, n_head_kv)

def reverse_hf_part(weights: NDArray, n_part: int) -> NDArray:
    r = weights.shape[0] // 3
    return weights[r * n_part : r * n_part + r, ...]

for i in range(block_count):
    if f"model.layers.{i}.self_attn.W_pack.weight" in model_part:
        print(f"Unpacking and permuting layer {i}")
        tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = reverse_hf_permute_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"], 0, head_count, head_count)
        tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = reverse_hf_permute_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"], 1, head_count, head_count_kv)
        tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = reverse_hf_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"], 2)
        del tmp[f"model.layers.{i}.self_attn.W_pack.weight"]
```

- Remaining tensors skip `.rotary_emb.inv_freq`, are cast to float32 if they use an unsupported dtype, renamed through `gguf.get_tensor_name_map(ARCH, block_count)`, converted to float16 for 2-dimensional `.weight` tensors when ftype is 1, and written with `gguf_writer.add_tensor`. Finally the header, KV metadata and (unless `--vocab-only`) the tensor data are written and the file is closed.
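To make the W_pack split concrete, here is a small self-contained sketch of the same operation using NumPy instead of torch, with made-up head counts and dimensions (only the shapes matter here):

```python
# Illustration of the W_pack unpacking above, using NumPy and made-up sizes
# (4 heads, head dim 8). The fused weight stacks the q, k and v projections
# row-wise; q and k additionally need the HF permutation undone.
import numpy as np

def reverse_hf_permute(weights, n_head, n_kv_head=None):
    if n_kv_head is not None and n_head != n_kv_head:
        n_head //= n_kv_head
    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
            .swapaxes(1, 2)
            .reshape(weights.shape))

n_head, head_dim = 4, 8
n_embd = n_head * head_dim                                   # 32
w_pack = np.arange(3 * n_embd * n_embd, dtype=np.float32).reshape(3 * n_embd, n_embd)

r = w_pack.shape[0] // 3                                     # rows per projection
q = reverse_hf_permute(w_pack[0 * r:1 * r], n_head, n_head)  # query, permuted back
k = reverse_hf_permute(w_pack[1 * r:2 * r], n_head, n_head)  # key, permuted back
v = w_pack[2 * r:3 * r]                                      # value, copied as-is

print(q.shape, k.shape, v.shape)                             # (32, 32) (32, 32) (32, 32)
```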
convert-falcon-hf-to-gguf.py (GPT-2-style tokenizer converter, 1 hunk)

- When reading the tokenizer, the vocab size is no longer taken from `len(tokenizer_json["model"]["vocab"])`. The number of tokens in tokenizer.json can differ from the model's expected vocab size, which leads to mismatched tensor sizes at inference time, so the converter now prefers the model's own hyperparameter:

```python
# The number of tokens in tokenizer.json can differ from the expected vocab size.
# This causes downstream issues with mismatched tensor sizes when running the inference
vocab_size = hparams["vocab_size"] if "vocab_size" in hparams else len(tokenizer_json["model"]["vocab"])
```

The surrounding `AutoTokenizer.from_pretrained(dir_model)` logic (referenced from ggllm.cpp's falcon_convert.py) is unchanged.
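A minimal sketch of that fallback in isolation follows; the file paths are placeholders, and the padding of any gap with dummy tokens is an illustrative assumption about why the two counts must agree, not something this hunk itself does:

```python
# Sketch of the vocab-size fallback above. Paths and the padding step are
# illustrative assumptions, not part of the actual change.
import json
from pathlib import Path

model_dir = Path("models/falcon-7b")  # placeholder path

hparams = json.loads((model_dir / "config.json").read_text(encoding="utf-8"))
tokenizer_json = json.loads((model_dir / "tokenizer.json").read_text(encoding="utf-8"))

tokens = list(tokenizer_json["model"]["vocab"].keys())

# Prefer the model's declared vocab size; fall back to the tokenizer's count.
vocab_size = hparams.get("vocab_size", len(tokens))

# If the model expects more rows than the tokenizer provides, pad so the token
# list lines up with the embedding/output tensor shapes.
while len(tokens) < vocab_size:
    tokens.append(f"<unused_{len(tokens)}>")

print(f"tokenizer entries: {len(tokenizer_json['model']['vocab'])}, using vocab_size = {vocab_size}")
```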
18 convert.py

@@ -145,7 +145,6 @@ GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = {
 class Params:
     n_vocab: int
     n_embd:  int
-    n_mult:  int
     n_layer: int
     n_ctx:   int
     n_ff:    int

@@ -161,15 +160,6 @@ class Params:
     # path to the directory containing the model files
     path_model: Path | None = None

-    @staticmethod
-    def find_n_mult(n_ff: int, n_embd: int) -> int:
-        # hardcoded magic range
-        for n_mult in range(8192, 1, -1):
-            calc_ff = (((8*n_embd) // 3 + n_mult - 1) // n_mult)*n_mult
-            if calc_ff == n_ff:
-                return n_mult
-        raise Exception(f"failed to find n_mult for (n_ff={n_ff}, n_embd={n_embd}).")
-
     @staticmethod
     def guessed(model: LazyModel) -> Params:
         # try transformer naming first

@@ -197,7 +187,6 @@ class Params:
         return Params(
             n_vocab = n_vocab,
             n_embd  = n_embd,
-            n_mult  = n_mult,
             n_layer = n_layer,
             n_ctx   = -1,
             n_ff    = n_ff,

@@ -225,8 +214,6 @@ class Params:
         else:
             f_rope_scale = None

-        n_mult = Params.find_n_mult(n_ff, n_embd)
-
         if "max_sequence_length" in config:
             n_ctx = config["max_sequence_length"]
         elif "max_position_embeddings" in config:

@@ -238,7 +225,6 @@ class Params:
         return Params(
             n_vocab = n_vocab,
             n_embd  = n_embd,
-            n_mult  = n_mult,
             n_layer = n_layer,
             n_ctx   = n_ctx,
             n_ff    = n_ff,

@@ -250,7 +236,7 @@ class Params:
         )

     # LLaMA v2 70B params.json
-    # {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1
+    # {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1}
     @staticmethod
     def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
         config = json.load(open(config_path))

@@ -258,7 +244,6 @@ class Params:
         n_vocab   = config["vocab_size"] if "vocab_size" in config else -1
         n_embd    = config["dim"]
         n_layer   = config["n_layers"]
-        n_mult    = config["multiple_of"]
         n_ff      = -1
         n_head    = config["n_heads"]
         n_head_kv = config["n_kv_heads"] if "n_kv_heads" in config else n_head

@@ -285,7 +270,6 @@ class Params:
         return Params(
             n_vocab = n_vocab,
             n_embd  = n_embd,
-            n_mult  = n_mult,
             n_layer = n_layer,
             n_ctx   = n_ctx,
             n_ff    = n_ff,
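For context on what the removed `find_n_mult` was undoing: `n_ff` is derived by rounding `8*n_embd/3` up to a multiple of `n_mult`, and the helper searched for an `n_mult` that reproduces the stored `n_ff`. Since `n_mult` is no longer kept in `Params`, that inversion goes away. A quick check of the forward formula, as a sketch using the well-known LLaMA-7B sizes (n_embd = 4096, multiple_of = 256, n_ff = 11008):

```python
def calc_ff(n_embd: int, n_mult: int) -> int:
    # the sizing rule the removed helper was inverting:
    # round 8*n_embd/3 up to the next multiple of n_mult
    return (((8 * n_embd) // 3 + n_mult - 1) // n_mult) * n_mult

assert calc_ff(4096, 256) == 11008  # LLaMA-7B feed-forward width
```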
51 examples/main-cmake-pkg/.gitignore (vendored, new file)

@@ -0,0 +1,51 @@
# Prerequisites
*.d

# Compiled Object files
*.slo
*.lo
*.o
*.obj

# Precompiled Headers
*.gch
*.pch

# Compiled Dynamic libraries
*.so
*.dylib
*.dll

# Fortran module files
*.mod
*.smod

# Compiled Static libraries
*.lai
*.la
*.a
*.lib

# Executables
*.exe
*.out
*.app

*.gguf

*.log
.DS_Store
.build/
.cache/
.direnv/
.envrc
.swiftpm
.venv
.clang-tidy
.vs/
.vscode/

build*/
out/
tmp/
36 examples/main-cmake-pkg/CMakeLists.txt (new file)

@@ -0,0 +1,36 @@
cmake_minimum_required(VERSION 3.12)
project("main-cmake-pkg" C CXX)
set(TARGET main-cmake-pkg)

find_package(Llama 0.0.1 REQUIRED)

# Bake common functionality in with target. Because applications
# using the relocatable Llama package should be outside of the
# source tree, main-cmake-pkg pretends the dependencies are built-in.

set(_common_path "${CMAKE_CURRENT_LIST_DIR}/../../common")
add_library(common OBJECT
    ${_common_path}/common.h
    ${_common_path}/common.cpp
    ${_common_path}/console.h
    ${_common_path}/console.cpp
    ${_common_path}/grammar-parser.h
    ${_common_path}/grammar-parser.cpp
    )

# WARNING: because build-info.h is auto-generated, it will only
# be available after the user has built the llama.cpp sources.
#
configure_file(${_common_path}/../build-info.h
    ${CMAKE_CURRENT_BINARY_DIR}/build-info.h
    COPYONLY)

target_include_directories(common PUBLIC ${LLAMA_INCLUDE_DIR}
    ${CMAKE_CURRENT_BINARY_DIR})

add_executable(${TARGET} ${CMAKE_CURRENT_LIST_DIR}/../main/main.cpp)
target_include_directories(${TARGET} PRIVATE ${_common_path})
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
37 examples/main-cmake-pkg/README.md (new file)

@@ -0,0 +1,37 @@
# llama.cpp/example/main-cmake-pkg

This program builds the [main](../main) application using a relocatable CMake package. It serves as an example of using the `find_package()` CMake command to conveniently include [llama.cpp](https://github.com/ggerganov/llama.cpp) in projects which live outside of the source tree.

## Building

Because this example is "outside of the source tree", it is important to first build/install llama.cpp using CMake. An example is provided here, but please see the [llama.cpp build instructions](../..) for more detailed build instructions.

### Considerations

When hardware acceleration libraries are used (e.g. CUBlas, Metal, CLBlast, etc.), CMake must be able to locate the associated CMake package. In the example below, when building _main-cmake-pkg_ notice the `CMAKE_PREFIX_PATH` includes the Llama CMake package location _in addition to_ the CLBlast package—which was used when compiling _llama.cpp_.

### Build llama.cpp and install to C:\LlamaCPP directory

In this case, CLBlast was already installed so the CMake package is referenced in `CMAKE_PREFIX_PATH`.

```cmd
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
mkdir build
cd build
cmake .. -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=C:/CLBlast/lib/cmake/CLBlast -G "Visual Studio 17 2022" -A x64
cmake --build . --config Release
cmake --install . --prefix C:/LlamaCPP
```

### Build main-cmake-pkg


```cmd
cd ..\examples\main-cmake-pkg
mkdir build
cd build
cmake .. -DBUILD_SHARED_LIBS=OFF -DCMAKE_PREFIX_PATH="C:/CLBlast/lib/cmake/CLBlast;C:/LlamaCPP/lib/cmake/Llama" -G "Visual Studio 17 2022" -A x64
cmake --build . --config Release
cmake --install . --prefix C:/MyLlamaApp
```
@@ -42,6 +42,7 @@ int main(int argc, char ** argv) {

     // load the draft model
     params.model = params.model_draft;
+    params.n_gpu_layers = params.n_gpu_layers_draft;
     std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params);

     // tokenize the prompt

@@ -81,7 +82,7 @@ int main(int argc, char ** argv) {
     //GGML_ASSERT(n_vocab == llama_n_vocab(ctx_dft));

     // how many tokens to draft each time
-    const int n_draft = params.n_draft;
+    int n_draft = params.n_draft;

     int n_predict = 0;
     int n_drafted = 0;

@@ -130,6 +131,7 @@ int main(int argc, char ** argv) {
     LOG("drafted: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_dft, drafted));

     int i_dft = 0;
+
     while (true) {
         // sample from the target model
         const llama_token id = llama_sample_token(ctx_tgt, NULL, grammar_tgt, params, last_tokens, candidates, i_dft);

@@ -173,6 +175,27 @@ int main(int argc, char ** argv) {
             llama_eval(ctx_dft, &id, 1, n_past_dft, params.n_threads);
             ++n_past_dft;

+            // heuristic for n_draft
+            {
+                const int n_draft_cur = (int) drafted.size();
+                const bool all_accepted = i_dft == n_draft_cur;
+
+                LOG("n_draft      = %d\n", n_draft);
+                LOG("n_draft_cur  = %d\n", n_draft_cur);
+                LOG("i_dft        = %d\n", i_dft);
+                LOG("all_accepted = %d\n", all_accepted);
+
+                if (all_accepted && n_draft == n_draft_cur) {
+                    LOG(" - max drafted tokens accepted - n_draft += 8\n");
+                    n_draft = std::min(30, n_draft + 8);
+                } else if (all_accepted) {
+                    LOG(" - partially drafted tokens accepted - no change\n");
+                } else {
+                    LOG(" - drafted token rejected - n_draft -= 1\n");
+                    n_draft = std::max(2, n_draft - 1);
+                }
+            }
+
             drafted.clear();
             drafted.push_back(id);
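The heuristic added above is a simple additive controller for the draft length: grow by 8 (capped at 30) when a full-length draft is entirely accepted, leave it unchanged when a shorter draft is fully accepted, and shrink by 1 (floor of 2) when a drafted token is rejected. The same rule in isolation, as a small sketch (Python, illustrative function name):

```python
def update_n_draft(n_draft: int, n_draft_cur: int, i_dft: int) -> int:
    # n_draft_cur: how many tokens were drafted this round
    # i_dft: how many of them the target model accepted
    all_accepted = i_dft == n_draft_cur
    if all_accepted and n_draft == n_draft_cur:
        return min(30, n_draft + 8)  # full-length draft accepted: draft more next time
    if all_accepted:
        return n_draft               # shorter draft fully accepted: no change
    return max(2, n_draft - 1)       # a drafted token was rejected: back off

n = 16
n = update_n_draft(n, n, n)       # -> 24
n = update_n_draft(n, n, n)       # -> 30 (capped)
n = update_n_draft(n, n, n - 3)   # -> 29
```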
@@ -45,6 +45,8 @@
         postInstall = ''
           mv $out/bin/main $out/bin/llama
           mv $out/bin/server $out/bin/llama-server
+          mkdir -p $out/include
+          cp ${src}/llama.h $out/include/
         '';
         cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" "-DLLAMA_MPI=ON" "-DBUILD_SHARED_LIBS=ON" "-DCMAKE_SKIP_BUILD_RPATH=ON" ];
       in
1666 ggml-cuda.cu (file diff suppressed because it is too large)

72 ggml-metal.m
@@ -63,7 +63,9 @@ struct ggml_metal_context {
     GGML_METAL_DECL_KERNEL(relu);
     GGML_METAL_DECL_KERNEL(gelu);
     GGML_METAL_DECL_KERNEL(soft_max);
+    GGML_METAL_DECL_KERNEL(soft_max_4);
     GGML_METAL_DECL_KERNEL(diag_mask_inf);
+    GGML_METAL_DECL_KERNEL(diag_mask_inf_8);
     GGML_METAL_DECL_KERNEL(get_rows_f16);
     GGML_METAL_DECL_KERNEL(get_rows_q4_0);
     GGML_METAL_DECL_KERNEL(get_rows_q4_1);

@@ -77,6 +79,7 @@ struct ggml_metal_context {
     GGML_METAL_DECL_KERNEL(norm);
     GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
     GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_1row);
+    GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_l4);
     GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
     GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32);
     GGML_METAL_DECL_KERNEL(mul_mat_q8_0_f32);

@@ -117,14 +120,17 @@ static NSString * const msl_library_source = @"see metal.metal";
 struct ggml_metal_context * ggml_metal_init(int n_cb) {
     metal_printf("%s: allocating\n", __func__);

-    // Show all the Metal device instances in the system
-    NSArray * devices = MTLCopyAllDevices();
     id <MTLDevice> device;
     NSString * s;

+#if TARGET_OS_OSX
+    // Show all the Metal device instances in the system
+    NSArray * devices = MTLCopyAllDevices();
     for (device in devices) {
         s = [device name];
         metal_printf("%s: found device: %s\n", __func__, [s UTF8String]);
     }
+#endif

     // Pick and show default Metal device
     device = MTLCreateSystemDefaultDevice();

@@ -141,12 +147,20 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
     ctx->d_queue = dispatch_queue_create("llama.cpp", DISPATCH_QUEUE_CONCURRENT);

-#if 0
-    // compile from source string and show compile log
+#ifdef GGML_SWIFT
+    // load the default.metallib file
     {
         NSError * error = nil;

-        ctx->library = [ctx->device newLibraryWithSource:msl_library_source options:nil error:&error];
+        NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
+        NSString * llamaBundlePath = [bundle pathForResource:@"llama_llama" ofType:@"bundle"];
+        NSBundle * llamaBundle = [NSBundle bundleWithPath:llamaBundlePath];
+        NSString * libPath = [llamaBundle pathForResource:@"default" ofType:@"metallib"];
+        NSURL * libURL = [NSURL fileURLWithPath:libPath];
+
+        // Load the metallib file into a Metal library
+        ctx->library = [ctx->device newLibraryWithURL:libURL error:&error];
+
         if (error) {
             metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]);
             return NULL;

@@ -207,7 +221,9 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
         GGML_METAL_ADD_KERNEL(relu);
         GGML_METAL_ADD_KERNEL(gelu);
         GGML_METAL_ADD_KERNEL(soft_max);
+        GGML_METAL_ADD_KERNEL(soft_max_4);
         GGML_METAL_ADD_KERNEL(diag_mask_inf);
+        GGML_METAL_ADD_KERNEL(diag_mask_inf_8);
         GGML_METAL_ADD_KERNEL(get_rows_f16);
         GGML_METAL_ADD_KERNEL(get_rows_q4_0);
         GGML_METAL_ADD_KERNEL(get_rows_q4_1);

@@ -221,6 +237,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
         GGML_METAL_ADD_KERNEL(norm);
         GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
         GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_1row);
+        GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_l4);
         GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
         GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32);
         GGML_METAL_ADD_KERNEL(mul_mat_q8_0_f32);

@@ -247,13 +264,15 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
 #undef GGML_METAL_ADD_KERNEL
     }

-    metal_printf("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
     metal_printf("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
+#if TARGET_OS_OSX
+    metal_printf("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
     if (ctx->device.maxTransferRate != 0) {
         metal_printf("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
     } else {
         metal_printf("%s: maxTransferRate = built-in GPU\n", __func__);
     }
+#endif

     return ctx;
 }

@@ -273,7 +292,8 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
     GGML_METAL_DEL_KERNEL(relu);
     GGML_METAL_DEL_KERNEL(gelu);
     GGML_METAL_DEL_KERNEL(soft_max);
-    GGML_METAL_DEL_KERNEL(diag_mask_inf);
+    GGML_METAL_DEL_KERNEL(soft_max_4);
+    GGML_METAL_DEL_KERNEL(diag_mask_inf_8);
     GGML_METAL_DEL_KERNEL(get_rows_f16);
     GGML_METAL_DEL_KERNEL(get_rows_q4_0);
     GGML_METAL_DEL_KERNEL(get_rows_q4_1);

@@ -287,6 +307,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
     GGML_METAL_DEL_KERNEL(norm);
     GGML_METAL_DEL_KERNEL(mul_mat_f16_f32);
     GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_1row);
+    GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_l4);
     GGML_METAL_DEL_KERNEL(mul_mat_q4_0_f32);
     GGML_METAL_DEL_KERNEL(mul_mat_q4_1_f32);
     GGML_METAL_DEL_KERNEL(mul_mat_q8_0_f32);

@@ -454,6 +475,7 @@ bool ggml_metal_add_buffer(
         }
     }

+#if TARGET_OS_OSX
     metal_printf(", (%8.2f / %8.2f)",
         ctx->device.currentAllocatedSize / 1024.0 / 1024.0,
         ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);

@@ -463,6 +485,9 @@ bool ggml_metal_add_buffer(
         } else {
             metal_printf("\n");
         }
+#else
+        metal_printf(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1024.0 / 1024.0);
+#endif
     }

     return true;

@@ -750,7 +775,7 @@ void ggml_metal_graph_compute(
                 [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
                 [encoder setBytes:&scale length:sizeof(scale) atIndex:2];

-                const int64_t n = ggml_nelements(dst);
+                const int64_t n = ggml_nelements(dst)/4;

                 [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
             } break;

@@ -762,7 +787,7 @@ void ggml_metal_graph_compute(
                 [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                 [encoder setBuffer:id_dst offset:offs_dst atIndex:1];

-                const int64_t n = ggml_nelements(dst);
+                const int64_t n = ggml_nelements(dst)/4;

                 [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
             } break;

@@ -782,7 +807,7 @@ void ggml_metal_graph_compute(
                 [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                 [encoder setBuffer:id_dst offset:offs_dst atIndex:1];

-                const int64_t n = ggml_nelements(dst);
+                const int64_t n = ggml_nelements(dst)/4;

                 [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
             } break;

@@ -796,13 +821,16 @@ void ggml_metal_graph_compute(
             {
                 const int nth = 32;

-                [encoder setComputePipelineState:ctx->pipeline_soft_max];
+                if (ne00%4 == 0) {
+                    [encoder setComputePipelineState:ctx->pipeline_soft_max_4];
+                } else {
+                    [encoder setComputePipelineState:ctx->pipeline_soft_max];
+                }
                 [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                 [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
                 [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
                 [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
                 [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
-                [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];

                 [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
             } break;

@@ -810,14 +838,23 @@ void ggml_metal_graph_compute(
             {
                 const int n_past = ((int32_t *)(dst->op_params))[0];

-                [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf];
+                if (ne00%8 == 0) {
+                    [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf_8];
+                } else {
+                    [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf];
+                }
                 [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                 [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
                 [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
                 [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
                 [encoder setBytes:&n_past length:sizeof(int) atIndex:4];

-                [encoder dispatchThreadgroups:MTLSizeMake(ne00, ne01, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                if (ne00%8 == 0) {
+                    [encoder dispatchThreadgroups:MTLSizeMake(ne00*ne01*ne02/8, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                }
+                else {
+                    [encoder dispatchThreadgroups:MTLSizeMake(ne00, ne01, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                }
             } break;
         case GGML_OP_MUL_MAT:
             {

@@ -864,6 +901,7 @@ void ggml_metal_graph_compute(
                 } else {
                     int nth0 = 32;
                     int nth1 = 1;
+                    int nrows = 1;

                     // use custom matrix x vector kernel
                     switch (src0t) {

@@ -873,8 +911,12 @@ void ggml_metal_graph_compute(
                                 nth1 = 1;
                                 if (ne11 * ne12 < 4) {
                                     [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_1row];
+                                } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
+                                    [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_l4];
+                                    nrows = ne11;
                                 } else {
                                     [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
+                                    nrows = 4;
                                 }
                             } break;
                         case GGML_TYPE_Q4_0:

@@ -995,7 +1037,7 @@ void ggml_metal_graph_compute(
                         else if (src0t == GGML_TYPE_Q6_K) {
                             [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                         } else {
-                            int64_t ny = (ne11 + 3)/4;
+                            int64_t ny = (ne11 + nrows - 1)/nrows;
                             [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                         }
                     }
281 ggml-metal.metal

@@ -63,18 +63,18 @@ kernel void kernel_mul_row(
 }

 kernel void kernel_scale(
-    device const float * src0,
-    device       float * dst,
+    device const float4 * src0,
+    device       float4 * dst,
     constant     float & scale,
     uint tpig[[thread_position_in_grid]]) {
     dst[tpig] = src0[tpig] * scale;
 }

 kernel void kernel_silu(
-    device const float * src0,
-    device       float * dst,
+    device const float4 * src0,
+    device       float4 * dst,
     uint tpig[[thread_position_in_grid]]) {
-    float x = src0[tpig];
+    device const float4 & x = src0[tpig];
     dst[tpig] = x / (1.0f + exp(-x));
 }

@@ -89,10 +89,10 @@ constant float GELU_COEF_A = 0.044715f;
 constant float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;

 kernel void kernel_gelu(
-    device const float * src0,
-    device       float * dst,
+    device const float4 * src0,
+    device       float4 * dst,
     uint tpig[[thread_position_in_grid]]) {
-    float x = src0[tpig];
+    device const float4 & x = src0[tpig];

     // BEWARE !!!
     // Simply using "tanh" instead of "precise::tanh" will sometimes results in NaNs!

@@ -107,7 +107,6 @@ kernel void kernel_soft_max(
     constant int64_t & ne00,
     constant int64_t & ne01,
     constant int64_t & ne02,
-    threadgroup float * buf [[threadgroup(0)]],
     uint3 tgpig[[threadgroup_position_in_grid]],
     uint3 tpitg[[thread_position_in_threadgroup]],
     uint3 ntg[[threads_per_threadgroup]]) {

@@ -119,64 +118,70 @@ kernel void kernel_soft_max(
     device float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;

     // parallel max
-    buf[tpitg[0]] = -INFINITY;
-    for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
-        buf[tpitg[0]] = MAX(buf[tpitg[0]], psrc0[i00]);
+    float lmax = psrc0[tpitg[0]];
+    for (int i00 = tpitg[0] + ntg[0]; i00 < ne00; i00 += ntg[0]) {
+        lmax = MAX(lmax, psrc0[i00]);
     }
-
-    // reduce
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    for (uint i = ntg[0]/2; i > 0; i /= 2) {
-        if (tpitg[0] < i) {
-            buf[tpitg[0]] = MAX(buf[tpitg[0]], buf[tpitg[0] + i]);
-        }
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-    }
-
-    //// broadcast - not needed. There is a threadgroup barrier above in the last iteration of
-    // the loop, and when that is done, buf[0] has the correct (synchronized) value
-    //if (tpitg[0] == 0) {
-    //    buf[0] = buf[0];
-    //}
-
-    //threadgroup_barrier(mem_flags::mem_threadgroup);
-
-    const float max = buf[0];
+    const float max = simd_max(lmax);

     // parallel sum
-    buf[tpitg[0]] = 0.0f;
+    float lsum = 0.0f;
     for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
         const float exp_psrc0 = exp(psrc0[i00] - max);
-        buf[tpitg[0]] += exp_psrc0;
+        lsum += exp_psrc0;
         // Remember the result of exp here. exp is expensive, so we really do not
         // whish to compute it twice.
         pdst[i00] = exp_psrc0;
     }

-    // reduce
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    for (uint i = ntg[0]/2; i > 0; i /= 2) {
-        if (tpitg[0] < i) {
-            buf[tpitg[0]] += buf[tpitg[0] + i];
-        }
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-    }
-
-    // broadcast - not needed, see above
-    //// broadcast
-    //if (tpitg[0] == 0) {
-    //    buf[0] = buf[0];
-    //}
-
-    //threadgroup_barrier(mem_flags::mem_threadgroup);
-
-    const float sum = buf[0];
+    const float sum = simd_sum(lsum);

     for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
         pdst[i00] /= sum;
     }
 }

+kernel void kernel_soft_max_4(
+    device const float * src0,
+    device       float * dst,
+    constant int64_t & ne00,
+    constant int64_t & ne01,
+    constant int64_t & ne02,
+    uint3 tgpig[[threadgroup_position_in_grid]],
+    uint3 tpitg[[thread_position_in_threadgroup]],
+    uint3 ntg[[threads_per_threadgroup]]) {
+    const int64_t i03 = tgpig[2];
+    const int64_t i02 = tgpig[1];
+    const int64_t i01 = tgpig[0];
+
+    device const float4 * psrc4 = (device const float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
+    device       float4 * pdst4 = (device       float4 *)(dst  + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
+
+    // parallel max
+    float4 lmax4 = psrc4[tpitg[0]];
+    for (int i00 = tpitg[0] + ntg[0]; i00 < ne00/4; i00 += ntg[0]) {
+        lmax4 = fmax(lmax4, psrc4[i00]);
+    }
+    float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3]));
+
+    const float max = simd_max(lmax);
+
+    // parallel sum
+    float4 lsum4 = 0.0f;
+    for (int i00 = tpitg[0]; i00 < ne00/4; i00 += ntg[0]) {
+        const float4 exp_psrc4 = exp(psrc4[i00] - max);
+        lsum4 += exp_psrc4;
+        pdst4[i00] = exp_psrc4;
+    }
+    float lsum = lsum4[0] + lsum4[1] + lsum4[2] + lsum4[3];
+
+    const float sum = simd_sum(lsum);
+
+    for (int i00 = tpitg[0]; i00 < ne00/4; i00 += ntg[0]) {
+        pdst4[i00] /= sum;
+    }
+}
+
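Both the rewritten `kernel_soft_max` and the new `kernel_soft_max_4` still compute the standard numerically stable row softmax; the change is in how the max and sum reductions are performed (SIMD-group `simd_max`/`simd_sum` instead of a threadgroup-memory tree, with the `_4` variant reading `float4` vectors). For reference, the math they implement, as a numpy sketch:

```python
import numpy as np

def softmax_rows(x: np.ndarray) -> np.ndarray:
    # numerically stable row softmax: exp(x - max) / sum(exp(x - max)),
    # computed per row, which is what the Metal kernels do
    m = x.max(axis=-1, keepdims=True)
    e = np.exp(x - m)
    return e / e.sum(axis=-1, keepdims=True)

x = np.random.randn(4, 32).astype(np.float32)
y = softmax_rows(x)
assert np.allclose(y.sum(axis=-1), 1.0, atol=1e-6)
```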
 kernel void kernel_diag_mask_inf(
     device const float * src0,
     device       float * dst,

@@ -192,6 +197,33 @@ kernel void kernel_diag_mask_inf(
         dst[i02*ne01*ne00 + i01*ne00 + i00] = -INFINITY;
     } else {
         dst[i02*ne01*ne00 + i01*ne00 + i00] = src0[i02*ne01*ne00 + i01*ne00 + i00];
+    }
+}
+
+kernel void kernel_diag_mask_inf_8(
+    device const float4 * src0,
+    device       float4 * dst,
+    constant int64_t & ne00,
+    constant int64_t & ne01,
+    constant     int & n_past,
+    uint3 tpig[[thread_position_in_grid]]) {
+
+    const int64_t i = 2*tpig[0];
+
+    dst[i+0] = src0[i+0];
+    dst[i+1] = src0[i+1];
+    int64_t i4 = 4*i;
+    const int64_t i02 = i4/(ne00*ne01); i4 -= i02*ne00*ne01;
+    const int64_t i01 = i4/(ne00);      i4 -= i01*ne00;
+    const int64_t i00 = i4;
+    for (int k = 3; k >= 0; --k) {
+        if (i00 + 4 + k <= n_past + i01) {
+            break;
+        }
+        dst[i+1][k] = -INFINITY;
+        if (i00 + k > n_past + i01) {
+            dst[i][k] = -INFINITY;
+        }
     }
 }

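`kernel_diag_mask_inf_8` processes two `float4` values per thread, but the masking rule is unchanged from the scalar kernel: element `(i02, i01, i00)` is set to `-INFINITY` when `i00 > n_past + i01` and copied through otherwise. A plain numpy reference of that rule (the function name is ours, not ggml's):

```python
import numpy as np

def diag_mask_inf_ref(src: np.ndarray, n_past: int) -> np.ndarray:
    # src has shape (ne02, ne01, ne00); element (i02, i01, i00) becomes -inf
    # when i00 > n_past + i01, otherwise it is copied unchanged
    ne02, ne01, ne00 = src.shape
    i01 = np.arange(ne01)[:, None]
    i00 = np.arange(ne00)[None, :]
    mask = i00 > (n_past + i01)          # same test as the Metal kernels
    dst = src.copy()
    dst[:, mask] = -np.inf
    return dst

x = np.zeros((1, 4, 8), dtype=np.float32)
print(diag_mask_inf_ref(x, n_past=2)[0])
```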
@@ -616,6 +648,49 @@ kernel void kernel_mul_mat_f16_f32(
     }
 }

+// Assumes row size (ne00) is a multiple of 4
+kernel void kernel_mul_mat_f16_f32_l4(
+    device const  char * src0,
+    device const  char * src1,
+    device       float * dst,
+    constant   int64_t & ne00,
+    constant   int64_t & ne01,
+    constant   int64_t & ne02,
+    constant  uint64_t & nb00,
+    constant  uint64_t & nb01,
+    constant  uint64_t & nb02,
+    constant   int64_t & ne10,
+    constant   int64_t & ne11,
+    constant   int64_t & ne12,
+    constant  uint64_t & nb10,
+    constant  uint64_t & nb11,
+    constant  uint64_t & nb12,
+    constant   int64_t & ne0,
+    constant   int64_t & ne1,
+    uint3 tgpig[[threadgroup_position_in_grid]],
+    uint  tiisg[[thread_index_in_simdgroup]]) {
+
+    const int nrows = ne11;
+    const int64_t r0 = tgpig.x;
+    const int64_t im = tgpig.z;
+
+    device const half4 * x4 = (device const half4 *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
+
+    for (int r1 = 0; r1 < nrows; ++r1) {
+        device const float4 * y4 = (device const float4 *) (src1 + r1*nb11 + im*nb12);
+
+        float sumf = 0;
+        for (int i = tiisg; i < ne00/4; i += 32) {
+            for (int k = 0; k < 4; ++k) sumf += (float) x4[i][k] * y4[i][k];
+        }
+
+        float all_sum = simd_sum(sumf);
+        if (tiisg == 0) {
+            dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+        }
+    }
+}
+
 kernel void kernel_alibi_f32(
     device const float * src0,
     device       float * dst,

@@ -1800,29 +1875,34 @@ void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg)
 template <typename type4x4>
 void dequantize_q4_0(device const block_q4_0 *xb, short il, thread type4x4 & reg) {

     device const uint16_t * qs = ((device const uint16_t *)xb + 1);
-    const half d = il ? (xb->d / 16.h) : xb->d;
-    const half m = il ? ( -8.h * 16.h) : -8.h;
+    const float d1 = il ? (xb->d / 16.h) : xb->d;
+    const float d2 = d1 / 256.f;
+    const float md = -8.h * xb->d;
     const ushort mask0 = il ? 0x00F0 : 0x000F;
-    const ushort mask1 = il ? 0xF000 : 0x0F00;
+    const ushort mask1 = mask0 << 8;

     for (int i=0;i<8;i++) {
-        reg[i/2][2*(i%2)]   = (((qs[i] & mask0)     ) + m) * d;
-        reg[i/2][2*(i%2)+1] = (((qs[i] & mask1) >> 8) + m) * d;
+        reg[i/2][2*(i%2)+0] = d1 * (qs[i] & mask0) + md;
+        reg[i/2][2*(i%2)+1] = d2 * (qs[i] & mask1) + md;
     }

 }

 template <typename type4x4>
 void dequantize_q4_1(device const block_q4_1 *xb, short il, thread type4x4 & reg) {

     device const uint16_t * qs = ((device const uint16_t *)xb + 2);
-    const half d = il ? (xb->d / 16.h) : xb->d;
-    const half m = xb->m;
+    const float d1 = il ? (xb->d / 16.h) : xb->d;
+    const float d2 = d1 / 256.f;
+    const float m = xb->m;
     const ushort mask0 = il ? 0x00F0 : 0x000F;
-    const ushort mask1 = il ? 0xF000 : 0x0F00;
+    const ushort mask1 = mask0 << 8;

     for (int i=0;i<8;i++) {
-        reg[i/2][2*(i%2)]   = (((qs[i] & mask0)     ) * d) + m;
-        reg[i/2][2*(i%2)+1] = (((qs[i] & mask1) >> 8) * d) + m;
+        reg[i/2][2*(i%2)+0] = ((qs[i] & mask0) * d1) + m;
+        reg[i/2][2*(i%2)+1] = ((qs[i] & mask1) * d2) + m;
     }
 }

 template <typename type4x4>
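The reworked `dequantize_q4_0`/`dequantize_q4_1` drop the `>> 8` on the high byte by folding it into a second scale `d2 = d1/256`: because the masked value is a multiple of 256, scaling it by `d/256` equals shifting it down by 8 and then scaling by `d`. A quick Python check of that identity:

```python
d1 = 0.0123          # arbitrary scale
d2 = d1 / 256.0
for q in range(1 << 16):
    hi = q & 0x0F00  # mask1 for il == 0, as in the kernel
    assert (hi >> 8) * d1 == hi * d2
```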
@@ -1858,7 +1938,7 @@ void dequantize_q2_K(device const block_q2_K *xb, short il, thread type4x4 & reg

 template <typename type4x4>
 void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg) {
-    const float d_all = (float)(xb->d);
+    const half d_all = xb->d;
     device const uint8_t * q = (device const uint8_t *)xb->qs;
     device const uint8_t * h = (device const uint8_t *)xb->hmask;
     device const int8_t * scales = (device const int8_t *)xb->scales;

@@ -1871,17 +1951,20 @@ void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg
                              ((il/4)>0 ? 12  : 3);
     uint16_t kmask2 = il/8 ? 0xF0 : 0x0F;
     uint16_t scale_2 = scales[il%8], scale_1 = scales[8 + il%4];
-    int16_t dl_int = (il/4)&1 ? (scale_2&kmask2) | ((scale_1&kmask1) << 2) : \
-                     (scale_2&kmask2) | ((scale_1&kmask1) << 4);
-    float dl = il<8 ? d_all * (dl_int - 32.f) : d_all * (dl_int / 16.f - 32.f);
+    int16_t dl_int = (il/4)&1 ? (scale_2&kmask2) | ((scale_1&kmask1) << 2)
+                              : (scale_2&kmask2) | ((scale_1&kmask1) << 4);
+    half dl = il<8 ? d_all * (dl_int - 32.h) : d_all * (dl_int / 16.h - 32.h);
+    const half ml = 4.h * dl;

-    il = (il/2)%4;
-    float   coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h);
-    uint8_t mask = il>1 ? (il>2 ? 192    : 48)     : (il>0 ? 12    : 3);
+    il = (il/2) & 3;
+    const half    coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h);
+    const uint8_t mask = il>1 ? (il>2 ? 192    : 48)     : (il>0 ? 12    : 3);
+    dl *= coef;

     for (int i = 0; i < 16; ++i) {
-        reg[i/4][i%4] = coef * dl * ((q[i] & mask) - ((h[i] & m) ? 0 : 4.f/coef));
+        reg[i/4][i%4] = dl * (q[i] & mask) - (h[i] & m ? 0 : ml);
     }

 #else
     float    kcoef = il&1 ? 1.f/16.f : 1.f;
     uint16_t kmask = il&1 ? 0xF0 : 0x0F;

@@ -1895,31 +1978,37 @@ void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg
 #endif
 }

+static inline uchar2 get_scale_min_k4_just2(int j, int k, device const uchar * q) {
+    return j < 4 ? uchar2{uchar(q[j+0+k] & 63), uchar(q[j+4+k] & 63)}
+                 : uchar2{uchar((q[j+4+k] & 0xF) | ((q[j-4+k] & 0xc0) >> 2)), uchar((q[j+4+k] >> 4) | ((q[j-0+k] & 0xc0) >> 2))};
+}
+
 template <typename type4x4>
 void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg) {
-    device const uint8_t * q = xb->qs;
+    device const uchar * q = xb->qs;

 #if QK_K == 256
-    const float d = (float)(xb->d);
-    const float min = (float)(xb->dmin);
     short is = (il/4) * 2;
     q = q + (il/4) * 32 + 16 * (il&1);
-    il = il%4;
-    const uchar4 sc = get_scale_min_k4(is, xb->scales);
-    const float dl = il<2 ? d * sc[0]   : d * sc[2]/16.h;
-    const float ml = il<2 ? min * sc[1] : min * sc[3];
+    il = il & 3;
+    const uchar2 sc = get_scale_min_k4_just2(is, il/2, xb->scales);
+    const half d   = il < 2 ? xb->d : xb->d / 16.h;
+    const half min = xb->dmin;
+    const half dl = d * sc[0];
+    const half ml = min * sc[1];
 #else
     q = q + 16 * (il&1);
     device const uint8_t * s = xb->scales;
     device const half2 * dh = (device const half2 *)xb->d;
     const float2 d = (float2)dh[0];
     const float dl = il<2 ? d[0] * (s[0]&0xF) : d[0] * (s[1]&0xF)/16.h;
-    const float ml = il<2 ? d[1] * (s[0]>>4) : d[1 ]* (s[1]>>4);
+    const float ml = il<2 ? d[1] * (s[0]>>4) : d[1] * (s[1]>>4);
 #endif
     const ushort mask = il<2 ? 0x0F : 0xF0;
     for (int i = 0; i < 16; ++i) {
         reg[i/4][i%4] = dl * (q[i] & mask) - ml;
     }

 }

 template <typename type4x4>

@@ -1928,19 +2017,19 @@ void dequantize_q5_K(device const block_q5_K *xb, short il, thread type4x4 & reg
     device const uint8_t * qh = xb->qh;

 #if QK_K == 256
-    const float d = (float)(xb->d);
-    const float min = (float)(xb->dmin);
     short is = (il/4) * 2;
     q  = q  + 32 * (il/4) + 16 * (il&1);
     qh = qh + 16 * (il&1);
     uint8_t ul = 1 << (il/2);
-    il = il%4;
-    const uchar4 sc = get_scale_min_k4(is, xb->scales);
-    const float dl = il<2 ? d * sc[0]   : d * sc[2]/16.h;
-    const float ml = il<2 ? min * sc[1] : min * sc[3];
+    il = il & 3;
+    const uchar2 sc = get_scale_min_k4_just2(is, il/2, xb->scales);
+    const half d = il < 2 ? xb->d : xb->d / 16.h;
+    const half min = xb->dmin;
+    const half dl = d * sc[0];
+    const half ml = min * sc[1];

     const ushort mask = il<2 ? 0x0F : 0xF0;
-    const float qh_val = il<2 ? 16.f : 256.f;
+    const half qh_val = il<2 ? 16.h : 256.h;
     for (int i = 0; i < 16; ++i) {
         reg[i/4][i%4] = dl * ((q[i] & mask) + (qh[i] & ul ? qh_val : 0)) - ml;
     }

@@ -1959,7 +2048,7 @@ void dequantize_q5_K(device const block_q5_K *xb, short il, thread type4x4 & reg

 template <typename type4x4>
 void dequantize_q6_K(device const block_q6_K *xb, short il, thread type4x4 & reg) {
-    const float d_all = (float)(xb->d);
+    const half d_all = xb->d;
     device const uint8_t * ql = (device const uint8_t *)xb->ql;
     device const uint8_t * qh = (device const uint8_t *)xb->qh;
     device const int8_t * scales = (device const int8_t *)xb->scales;

@@ -1967,19 +2056,21 @@ void dequantize_q6_K(device const block_q6_K *xb, short il, thread type4x4 & reg
 #if QK_K == 256
     ql = ql + 64*(il/8) + 32*((il/2)&1) + 16*(il&1);
     qh = qh + 32*(il/8) + 16*(il&1);
-    float sc = scales[(il%2) + 2 * ((il/2))];
-    il = (il/2)%4;
+    half sc = scales[(il%2) + 2 * ((il/2))];
+    il = (il/2) & 3;
 #else
     ql = ql + 16 * (il&1);
-    float sc = scales[il];
+    half sc = scales[il];
 #endif
+    const uint16_t kmask1 = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3);
+    const uint16_t kmask2 = il>1 ? 0xF0 : 0x0F;
+    const half     coef = il>1 ? 1.f/16.h : 1.h;
+    const half     ml = d_all * sc * 32.h;
+    const half     dl = d_all * sc * coef;
     for (int i = 0; i < 16; ++i) {
-        uint16_t kmask1 = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3);
-        uint16_t kmask2 = il>1 ? 0xF0 : 0x0F;
-        const float coef = il>1 ? 1.f/16.f : 1.f;
-        float q = il&1 ? ((ql[i]&kmask2)|((qh[i]&kmask1)<<2)) - 32.f/coef : \
-                  ((ql[i]&kmask2)|((qh[i]&kmask1)<<4)) - 32.f/coef;
-        reg[i/4][i%4] = d_all * sc * q * coef;
+        const half q = il&1 ? ((ql[i] & kmask2) | ((qh[i] & kmask1) << 2))
+                            : ((ql[i] & kmask2) | ((qh[i] & kmask1) << 4));
+        reg[i/4][i%4] = dl * q - ml;
     }
 }

2 ggml.c

@@ -283,7 +283,7 @@ typedef double ggml_float;
 // 16-bit float
 // on Arm, we use __fp16
 // on x86, we use uint16_t
-#ifdef __ARM_NEON
+#if defined(__ARM_NEON) && !defined(_MSC_VER)

 // if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
 //
2 ggml.h

@@ -270,7 +270,7 @@ extern "C" {

 #if defined(__ARM_NEON) && defined(__CUDACC__)
     typedef half ggml_fp16_t;
-#elif defined(__ARM_NEON)
+#elif defined(__ARM_NEON) && !defined(_MSC_VER)
     typedef __fp16 ggml_fp16_t;
 #else
     typedef uint16_t ggml_fp16_t;
@@ -79,6 +79,7 @@ KEY_TOKENIZER_RWKV = "tokenizer.rwkv.world"
 class MODEL_ARCH(IntEnum):
     LLAMA  : int = auto()
     FALCON : int = auto()
+    BAICHUAN:int = auto()
     GPT2   : int = auto()
     GPTJ   : int = auto()
     GPTNEOX: int = auto()

@@ -108,6 +109,7 @@ class MODEL_TENSOR(IntEnum):
 MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.LLAMA:   "llama",
     MODEL_ARCH.FALCON:  "falcon",
+    MODEL_ARCH.BAICHUAN:"baichuan",
     MODEL_ARCH.GPT2:    "gpt2",
     MODEL_ARCH.GPTJ:    "gptj",
     MODEL_ARCH.GPTNEOX: "gptneox",

@@ -153,6 +155,22 @@ MODEL_TENSOR_NAMES: dict[MODEL_ARCH, dict[MODEL_TENSOR, str]] = {
         MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
         MODEL_TENSOR.FFN_UP:   "blk.{bid}.ffn_up",
     },
+    MODEL_ARCH.BAICHUAN: {
+        MODEL_TENSOR.TOKEN_EMBD:    "token_embd",
+        MODEL_TENSOR.OUTPUT_NORM:   "output_norm",
+        MODEL_TENSOR.OUTPUT:        "output",
+        MODEL_TENSOR.ROPE_FREQS:    "rope_freqs",
+        MODEL_TENSOR.ATTN_NORM:     "blk.{bid}.attn_norm",
+        MODEL_TENSOR.ATTN_Q:        "blk.{bid}.attn_q",
+        MODEL_TENSOR.ATTN_K:        "blk.{bid}.attn_k",
+        MODEL_TENSOR.ATTN_V:        "blk.{bid}.attn_v",
+        MODEL_TENSOR.ATTN_OUT:      "blk.{bid}.attn_output",
+        MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
+        MODEL_TENSOR.FFN_NORM:      "blk.{bid}.ffn_norm",
+        MODEL_TENSOR.FFN_GATE:      "blk.{bid}.ffn_gate",
+        MODEL_TENSOR.FFN_DOWN:      "blk.{bid}.ffn_down",
+        MODEL_TENSOR.FFN_UP:        "blk.{bid}.ffn_up",
+    },
     MODEL_ARCH.GPT2: {
         # TODO
     },

@@ -165,6 +183,10 @@ MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.ROPE_FREQS,
         MODEL_TENSOR.ATTN_ROT_EMBD,
     ],
+    MODEL_ARCH.BAICHUAN: [
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+    ],
 }

@@ -187,7 +209,7 @@ class TensorNameMap:
     # Output
     MODEL_TENSOR.OUTPUT: (
         "embed_out", # gptneox
-        "lm_head",   # gpt2 mpt falcon llama-hf
+        "lm_head",   # gpt2 mpt falcon llama-hf baichuan
         "output",    # llama-pth
     ),

@@ -195,7 +217,7 @@ class TensorNameMap:
     MODEL_TENSOR.OUTPUT_NORM: (
         "gpt_neox.final_layer_norm", # gptneox
         "transformer.ln_f",          # gpt2 falcon
-        "model.norm",                # llama-hf
+        "model.norm",                # llama-hf baichuan
        "norm",                      # llama-pth
     ),

@@ -311,6 +333,7 @@ class TensorNameMap:
             tensor_name = tensor_names.get(tensor)
             if tensor_name is None:
                 continue
+            mapping[tensor_name] = (tensor, tensor_name)
             for key in keys:
                 mapping[key] = (tensor, tensor_name)
         for bid in range(n_blocks):

@@ -319,11 +342,12 @@ class TensorNameMap:
                 if tensor_name is None:
                     continue
                 tensor_name = tensor_name.format(bid = bid)
+                mapping[tensor_name] = (tensor, tensor_name)
                 for key in keys:
                     key = key.format(bid = bid)
                     mapping[key] = (tensor, tensor_name)

-    def get_type_and_name(self, key: str, try_suffixes: Sequence[str]) -> tuple[MODEL_TENSOR, str] | None:
+    def get_type_and_name(self, key: str, try_suffixes: Sequence[str] = ()) -> tuple[MODEL_TENSOR, str] | None:
         result = self.mapping.get(key)
         if result is not None:
             return result

@@ -334,13 +358,13 @@ class TensorNameMap:
                 return (result[0], result[1] + suffix)
         return None

-    def get_name(self, key: str, try_suffixes: Sequence[str]) -> str | None:
+    def get_name(self, key: str, try_suffixes: Sequence[str] = ()) -> str | None:
         result = self.get_type_and_name(key, try_suffixes = try_suffixes)
         if result is None:
             return None
         return result[1]

-    def get_type(self, key: str, try_suffixes: Sequence[str]) -> MODEL_TENSOR | None:
+    def get_type(self, key: str, try_suffixes: Sequence[str] = ()) -> MODEL_TENSOR | None:
         result = self.get_type_and_name(key, try_suffixes = try_suffixes)
         if result is None:
             return None
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "gguf"
-version = "0.3.2"
+version = "0.3.3"
 description = "Write ML models in GGUF for GGML"
 authors = ["GGML <ggml@ggml.ai>"]
 packages = [

@@ -2609,7 +2609,10 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri

         memcpy(utmp, x[i].scales, 12);

-        const uint32x2_t mins8 = {utmp[1] & kmask1, ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4)};
+        uint32x2_t mins8 = { 0 };
+        mins8 = vset_lane_u32(utmp[1] & kmask1, mins8, 0);
+        mins8 = vset_lane_u32(((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4), mins8, 1);

         utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
         utmp[0] &= kmask1;
469
llama.cpp
469
llama.cpp
|
@ -155,6 +155,7 @@ static std::string format(const char * fmt, ...) {
|
||||||
enum llm_arch {
|
enum llm_arch {
|
||||||
LLM_ARCH_LLAMA,
|
LLM_ARCH_LLAMA,
|
||||||
LLM_ARCH_FALCON,
|
LLM_ARCH_FALCON,
|
||||||
|
LLM_ARCH_BAICHUAN,
|
||||||
LLM_ARCH_GPT2,
|
LLM_ARCH_GPT2,
|
||||||
LLM_ARCH_GPTJ,
|
LLM_ARCH_GPTJ,
|
||||||
LLM_ARCH_GPTNEOX,
|
LLM_ARCH_GPTNEOX,
|
||||||
|
@ -169,6 +170,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
|
||||||
{ LLM_ARCH_GPTJ, "gptj" },
|
{ LLM_ARCH_GPTJ, "gptj" },
|
||||||
{ LLM_ARCH_GPTNEOX, "gptneox" },
|
{ LLM_ARCH_GPTNEOX, "gptneox" },
|
||||||
{ LLM_ARCH_MPT, "mpt" },
|
{ LLM_ARCH_MPT, "mpt" },
|
||||||
|
{ LLM_ARCH_BAICHUAN,"baichuan" },
|
||||||
};
|
};
|
||||||
|
|
||||||
enum llm_kv {
|
enum llm_kv {
|
||||||
|
@@ -309,6 +311,25 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
        },
    },
    {
        LLM_ARCH_BAICHUAN,
        {
            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
            { LLM_TENSOR_OUTPUT, "output" },
            { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
            { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
        },
    },
    {
        LLM_ARCH_FALCON,
        {
@@ -1683,6 +1704,15 @@ static void llm_load_hparams(
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_BAICHUAN:
            {
                GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
                switch (hparams.n_layer) {
                    case 32: model.type = e_model::MODEL_7B; break;
                    case 40: model.type = e_model::MODEL_13B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        default: (void)0;
    };

@@ -1923,7 +1953,6 @@ static void llm_load_tensors(
    const int64_t n_vocab = hparams.n_vocab;

    const auto tn = LLM_TN(model.arch);

    switch (model.arch) {
        case LLM_ARCH_LLAMA:
            {
@@ -1966,6 +1995,72 @@ static void llm_load_tensors(

                model.layers.resize(n_layer);

                for (uint32_t i = 0; i < n_layer; ++i) {
                    const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
                    const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT

                    auto & layer = model.layers[i];

                    layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);

                    layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split);
                    layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split);
                    layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
                    layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);

                    layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);

                    layer.w1 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
                    layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
                    layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);

                    if (backend == GGML_BACKEND_GPU) {
                        vram_weights +=
                            ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
                            ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
                            ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
                    }
                }
            } break;
        case LLM_ARCH_BAICHUAN:
            {
                model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
                {
                    ggml_backend backend_norm;
                    ggml_backend backend_output;

                    if (n_gpu_layers > int(n_layer)) {
                        // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
                        // on Windows however this is detrimental unless everything is on the GPU
#ifndef _WIN32
                        backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
#else
                        backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
#endif // _WIN32

                        backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
                    } else {
                        backend_norm = GGML_BACKEND_CPU;
                        backend_output = GGML_BACKEND_CPU;
                    }

                    model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
                    model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);

                    if (backend_norm == GGML_BACKEND_GPU) {
                        vram_weights += ggml_nbytes(model.output_norm);
                    }
                    if (backend_output == GGML_BACKEND_GPU_SPLIT) {
                        vram_weights += ggml_nbytes(model.output);
                    }
                }

                const uint32_t n_ff = hparams.n_ff;

                const int i_gpu_start = n_layer - n_gpu_layers;

                model.layers.resize(n_layer);

                for (uint32_t i = 0; i < n_layer; ++i) {
                    const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
                    const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
@@ -2542,6 +2637,367 @@ static struct ggml_cgraph * llm_build_llama(
    return gf;
}


static struct ggml_cgraph * llm_build_baichaun(
        llama_context & lctx,
        const llama_token * tokens,
        const float * embd,
        int n_tokens,
        int n_past) {

    GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT

    const int N = n_tokens;

    const auto & model = lctx.model;
    const auto & hparams = model.hparams;

    const auto & kv_self = lctx.kv_self;

    GGML_ASSERT(!!kv_self.ctx);

    const int64_t n_embd      = hparams.n_embd;
    const int64_t n_layer     = hparams.n_layer;
    const int64_t n_ctx       = hparams.n_ctx;
    const int64_t n_head      = hparams.n_head;
    const int64_t n_head_kv   = hparams.n_head_kv;
    const int64_t n_embd_head = hparams.n_embd_head();
    const int64_t n_embd_gqa  = hparams.n_embd_gqa();

    GGML_ASSERT(n_embd_head == hparams.n_rot);

    const float freq_base    = hparams.rope_freq_base;
    const float freq_scale   = hparams.rope_freq_scale;
    const float norm_rms_eps = hparams.f_norm_rms_eps;

    const int n_gpu_layers = model.n_gpu_layers;

    auto & buf_compute = lctx.buf_compute;

    struct ggml_init_params params = {
        /*.mem_size   =*/ buf_compute.size,
        /*.mem_buffer =*/ buf_compute.data,
        /*.no_alloc   =*/ false,
    };

    params.no_alloc = true;

    struct ggml_context * ctx0 = ggml_init(params);

    ggml_cgraph * gf = ggml_new_graph(ctx0);

    struct ggml_tensor * cur;
    struct ggml_tensor * inpL;

    if (tokens) {
        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);

        ggml_allocr_alloc(lctx.alloc, inp_tokens);
        if (!ggml_allocr_is_measure(lctx.alloc)) {
            memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
        }
        ggml_set_name(inp_tokens, "inp_tokens");

        inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
    } else {
#ifdef GGML_USE_MPI
        GGML_ASSERT(false && "not implemented");
#endif

        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);

        ggml_allocr_alloc(lctx.alloc, inpL);
        if (!ggml_allocr_is_measure(lctx.alloc)) {
            memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
        }
    }

    const int i_gpu_start = n_layer - n_gpu_layers;
    (void) i_gpu_start;

    // offload functions set the tensor output backend to GPU
    // tensors are GPU-accelerated if any input or the output has been offloaded
    //
    // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
    // in that case ggml_cuda_assign_buffers has no effect
    offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
    offload_func_t offload_func_kq = llama_nop;
    offload_func_t offload_func_v  = llama_nop;

#ifdef GGML_USE_CUBLAS
    if (n_gpu_layers > n_layer) {
        offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
    }
    if (n_gpu_layers > n_layer + 1) {
        offload_func_v  = ggml_cuda_assign_buffers_no_alloc;
    }
    if (n_gpu_layers > n_layer + 2) {
        offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
    }
#endif // GGML_USE_CUBLAS

    struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
    ggml_allocr_alloc(lctx.alloc, KQ_scale);
    if (!ggml_allocr_is_measure(lctx.alloc)) {
        ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
    }
    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");

    for (int il = 0; il < n_layer; ++il) {
        ggml_format_name(inpL, "layer_inp_%d", il);

        offload_func_t offload_func = llama_nop;

#ifdef GGML_USE_CUBLAS
        if (il >= i_gpu_start) {
            offload_func = ggml_cuda_assign_buffers_no_alloc;
        }
#endif // GGML_USE_CUBLAS

        struct ggml_tensor * inpSA = inpL;

        // norm
        {
            cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps);
            offload_func(cur);
            ggml_set_name(cur, "rms_norm_0");

            // cur = cur*attn_norm(broadcasted)
            cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm);
            offload_func(cur);
            ggml_set_name(cur, "attention_norm_0");
        }

        // self-attention
        {
            // compute Q and K and RoPE them
            struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
            offload_func_kq(tmpk);
            ggml_set_name(tmpk, "tmpk");

            struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
            offload_func_kq(tmpq);
            ggml_set_name(tmpq, "tmpq");

            struct ggml_tensor * Kcur;
            struct ggml_tensor * Qcur;
            switch (model.type) {
                case MODEL_7B:
                    Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
                    Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
                    break;
                case MODEL_13B:
                    Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N);
                    Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N);
                    break;
                default:
                    GGML_ASSERT(false);
            }

            offload_func_kq(Kcur);
            ggml_set_name(Kcur, "Kcur");

            offload_func_kq(Qcur);
            ggml_set_name(Qcur, "Qcur");

            // store key and value to memory
            {
                // compute the transposed [N, n_embd] V matrix

                struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
                offload_func_v(tmpv);
                ggml_set_name(tmpv, "tmpv");

                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, N));
                offload_func_v(Vcur);
                ggml_set_name(Vcur, "Vcur");

                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
                offload_func_kq(k);
                ggml_set_name(k, "k");

                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
                        (   n_ctx)*ggml_element_size(kv_self.v),
                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
                offload_func_v(v);
                ggml_set_name(v, "v");

                // important: storing RoPE-ed version of K in the KV cache!
                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
            }

            struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
            offload_func_kq(Q);
            ggml_set_name(Q, "Q");

            struct ggml_tensor * K =
                ggml_view_3d(ctx0, kv_self.k,
                        n_embd_head, n_past + N, n_head_kv,
                        ggml_element_size(kv_self.k)*n_embd_gqa,
                        ggml_element_size(kv_self.k)*n_embd_head,
                        ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
            offload_func_kq(K);
            ggml_set_name(K, "K");

            // K * Q
            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
            offload_func_kq(KQ);
            ggml_set_name(KQ, "KQ");

            // KQ_scaled = KQ / sqrt(n_embd_head)
            // KQ_scaled shape [n_past + N, N, n_head, 1]
            struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
            offload_func_kq(KQ_scaled);
            ggml_set_name(KQ_scaled, "KQ_scaled");

            struct ggml_tensor * KQ_masked;
            struct ggml_tensor * KQ_scaled_alibi;

            switch (model.type) {
                case MODEL_7B:
                    KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
                    break;
                case MODEL_13B:
                    KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, n_past, n_head, 8);
                    ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
                    KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past);
                    break;
                default:
                    GGML_ASSERT(false);
            }
            // KQ_masked = mask_past(KQ_scaled)
            // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
            // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past);
            // offload_func_kq(KQ_masked);
            // ggml_set_name(KQ_masked, "KQ_masked");

            // KQ = soft_max(KQ_masked)
            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
            offload_func_v(KQ_soft_max);
            ggml_set_name(KQ_soft_max, "KQ_soft_max");

            // split cached V into n_head heads
            struct ggml_tensor * V =
                ggml_view_3d(ctx0, kv_self.v,
                        n_past + N, n_embd_head, n_head_kv,
                        ggml_element_size(kv_self.v)*n_ctx,
                        ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
                        ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
            offload_func_v(V);
            ggml_set_name(V, "V");

#if 1
            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
            offload_func_v(KQV);
            ggml_set_name(KQV, "KQV");
#else
            // make V contiguous in memory to speed up the matmul, however we waste time on the copy
            // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
            // is there a better way?
            struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head));
            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
#endif

            // KQV_merged = KQV.permute(0, 2, 1, 3)
            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
            offload_func_v(KQV_merged);
            ggml_set_name(KQV_merged, "KQV_merged");

            // cur = KQV_merged.contiguous().view(n_embd, N)
            cur = ggml_cpy(ctx0,
                    KQV_merged,
                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
            offload_func_v(cur);
            ggml_set_name(cur, "KQV_merged_contiguous");

            // projection (no bias)
            cur = ggml_mul_mat(ctx0,
                    model.layers[il].wo,
                    cur);
            offload_func(cur);
            ggml_set_name(cur, "result_wo");
        }

        struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
        offload_func(inpFF);
        ggml_set_name(inpFF, "inpFF");

        // feed-forward network
        {
            // norm
            {
                cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps);
                offload_func(cur);
                ggml_set_name(cur, "rms_norm_1");

                // cur = cur*ffn_norm(broadcasted)
                cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
                offload_func(cur);
                ggml_set_name(cur, "ffn_norm");
            }

            struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
                    model.layers[il].w3,
                    cur);
            offload_func(tmp);
            ggml_set_name(tmp, "result_w3");

            cur = ggml_mul_mat(ctx0,
                    model.layers[il].w1,
                    cur);
            offload_func(cur);
            ggml_set_name(cur, "result_w1");

            // SILU activation
            cur = ggml_silu(ctx0, cur);
            offload_func(cur);
            ggml_set_name(cur, "silu");

            cur = ggml_mul(ctx0, cur, tmp);
            offload_func(cur);
            ggml_set_name(cur, "silu_x_result_w3");

            cur = ggml_mul_mat(ctx0,
                    model.layers[il].w2,
                    cur);
            offload_func(cur);
            ggml_set_name(cur, "result_w2");
        }

        cur = ggml_add(ctx0, cur, inpFF);
        offload_func(cur);
        ggml_set_name(cur, "inpFF_+_result_w2");

        // input for next layer
        inpL = cur;
    }

    cur = inpL;

    // norm
    {
        cur = ggml_rms_norm(ctx0, cur, norm_rms_eps);
        offload_func_nr(cur);
        ggml_set_name(cur, "rms_norm_2");

        // cur = cur*norm(broadcasted)
        cur = ggml_mul(ctx0, cur, model.output_norm);
        // offload_func_nr(cur); // TODO CPU + GPU mirrored backend
        ggml_set_name(cur, "result_norm");
    }

    // lm_head
    cur = ggml_mul_mat(ctx0, model.output, cur);
    ggml_set_name(cur, "result_output");

    ggml_build_forward_expand(gf, cur);

    ggml_free(ctx0);

    return gf;
}

static struct ggml_cgraph * llm_build_falcon(
        llama_context & lctx,
        const llama_token * tokens,
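One detail worth noting in `llm_build_baichaun` above: the 7B and 13B variants inject position information differently. The 7B path applies RoPE to the projected Q/K tensors via `ggml_rope_custom_inplace`, while the 13B path leaves Q/K unrotated and instead adds an ALiBi bias to the scaled attention scores with `ggml_alibi` before the causal mask. A simplified, self-contained sketch of that dispatch, with plain enums standing in for the real ggml graph calls:

```cpp
#include <cstdio>
#include <initializer_list>
#include <stdexcept>

// Stand-ins for the real model types and graph operations.
enum e_model { MODEL_7B, MODEL_13B };
enum class pos_encoding { rope_on_qk, alibi_on_scores };

// Mirrors the per-size dispatch in llm_build_baichaun: 7B rotates Q/K,
// 13B leaves Q/K as-is and biases the KQ scores with ALiBi instead.
pos_encoding positional_encoding_for(e_model type) {
    switch (type) {
        case MODEL_7B:  return pos_encoding::rope_on_qk;
        case MODEL_13B: return pos_encoding::alibi_on_scores;
    }
    throw std::runtime_error("unknown model type");
}

int main() {
    for (e_model t : { MODEL_7B, MODEL_13B }) {
        const char * what = positional_encoding_for(t) == pos_encoding::rope_on_qk
            ? "RoPE applied to Q/K before the KV cache"
            : "ALiBi bias added to KQ_scaled before diag masking";
        printf("%s: %s\n", t == MODEL_7B ? "7B" : "13B", what);
    }
    return 0;
}
```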
@@ -2864,6 +3320,10 @@ static struct ggml_cgraph * llama_build_graph(
            {
                result = llm_build_llama(lctx, tokens, embd, n_tokens, n_past);
            } break;
        case LLM_ARCH_BAICHUAN:
            {
                result = llm_build_baichaun(lctx, tokens, embd, n_tokens, n_past);
            } break;
        case LLM_ARCH_FALCON:
            {
                result = llm_build_falcon(lctx, tokens, embd, n_tokens, n_past);
@@ -3121,10 +3581,9 @@ struct llm_tokenizer_spm {
        while (offs < text.size()) {
            llm_symbol sym;
            size_t len = utf8_len(text[offs]);
            GGML_ASSERT(offs + len <= text.size());
            sym.text = text.c_str() + offs;
            sym.n = len;
            sym.n = std::min(len, text.size() - offs);
            offs += len;
            offs += sym.n;
            sym.prev = index - 1;
            sym.next = offs == text.size() ? -1 : index + 1;
            index++;
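The SPM tokenizer change above swaps a hard assert for a clamp: if the lead byte near the end of the input claims a multi-byte UTF-8 sequence that runs past the end of the string, the symbol length is truncated to the bytes actually available and the offset advances by that clamped amount. A small self-contained sketch of the same clamping loop; the `utf8_len` helper here is an assumed re-implementation for illustration, not the one in llama.cpp:

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <string>

// Assumed helper: number of bytes a UTF-8 sequence claims, based on its lead byte.
static size_t utf8_len(char src) {
    const uint8_t lead = static_cast<uint8_t>(src);
    if (lead < 0x80)        return 1;
    if ((lead >> 5) == 0x6) return 2;  // 110xxxxx
    if ((lead >> 4) == 0xe) return 3;  // 1110xxxx
    if ((lead >> 3) == 0x1e) return 4; // 11110xxx
    return 1;                          // continuation/invalid byte: treat as one
}

int main() {
    // "a" followed by the first byte of a 2-byte sequence whose tail is missing
    const std::string text = std::string("a") + "\xc3";

    size_t offs = 0;
    while (offs < text.size()) {
        const size_t len = utf8_len(text[offs]);
        // clamp instead of asserting, mirroring sym.n = std::min(len, text.size() - offs)
        const size_t n = std::min(len, text.size() - offs);
        printf("symbol at %zu: claimed %zu byte(s), using %zu\n", offs, len, n);
        offs += n; // advancing by the clamped size keeps the loop inside the buffer
    }
    return 0;
}
```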
@@ -6218,7 +6677,7 @@ int llama_tokenize_with_model(
    auto res = llama_tokenize_internal(model->vocab, text, add_bos);

    if (n_max_tokens < (int) res.size()) {
        LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
        // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
        return -((int) res.size());
    }
4
prompts/chat-with-baichuan.txt
Normal file
@@ -0,0 +1,4 @@
以下内容为人类用户与与一位智能助手的对话。

用户:你好!
助手:
69
scripts/LlamaConfig.cmake.in
Normal file
@@ -0,0 +1,69 @@
set(LLAMA_VERSION @LLAMA_INSTALL_VERSION@)
set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@)
set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@)
set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@)
set(LLAMA_BLAS @LLAMA_BLAS@)
set(LLAMA_CUBLAS @LLAMA_CUBLAS@)
set(LLAMA_METAL @LLAMA_METAL@)
set(LLAMA_MPI @LLAMA_MPI@)
set(LLAMA_CLBLAST @LLAMA_CLBLAST@)
set(LLAMA_HIPBLAS @LLAMA_HIPBLAS@)
set(LLAMA_ACCELERATE @LLAMA_ACCELERATE@)

@PACKAGE_INIT@

set_and_check(LLAMA_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@")
set_and_check(LLAMA_LIB_DIR "@PACKAGE_LLAMA_LIB_INSTALL_DIR@")
set_and_check(LLAMA_BIN_DIR "@PACKAGE_LLAMA_BIN_INSTALL_DIR@")

# Ensure transient dependencies satisfied

find_package(Threads REQUIRED)
if (APPLE AND LLAMA_ACCELERATE)
    find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED)
endif()

if (LLAMA_BLAS)
    find_package(BLAS REQUIRED)
endif()

if (LLAMA_CUBLAS)
    find_package(CUDAToolkit REQUIRED)
endif()

if (LLAMA_METAL)
    find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
    find_library(METAL_FRAMEWORK Metal REQUIRED)
    find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
endif()

if (LLAMA_MPI)
    find_package(MPI REQUIRED)
endif()

if (LLAMA_CLBLAST)
    find_package(CLBlast REQUIRED)
endif()

if (LLAMA_HIPBLAS)
    find_package(hip REQUIRED)
    find_package(hipblas REQUIRED)
    find_package(rocblas REQUIRED)
endif()

find_library(llama_LIBRARY llama
    REQUIRED
    HINTS ${LLAMA_LIB_DIR})

set(_llama_link_deps "Threads::Threads" "@LLAMA_EXTRA_LIBS@")
add_library(llama UNKNOWN IMPORTED)
set_target_properties(llama
    PROPERTIES
        INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}"
        INTERFACE_LINK_LIBRARIES "${_llama_link_deps}"
        IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
        IMPORTED_LOCATION "${llama_LIBRARY}"
        INTERFACE_COMPILE_FEATURES cxx_std_11
        POSITION_INDEPENDENT_CODE ON )

check_required_components(Llama)
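With this config installed, an out-of-tree project should be able to call `find_package(Llama REQUIRED)` and link the imported `llama` target. A minimal consumer translation unit such a project might build (a hedged sketch; it only uses calls that already appear elsewhere in this commit):

```cpp
// main.cpp in a hypothetical downstream project that links the imported
// `llama` target provided by the installed LlamaConfig.cmake.
#include "llama.h"

#include <cstdio>

int main() {
    llama_backend_init(false); // NUMA off, as in the new tokenizer test
    printf("llama backend initialized via the installed package\n");
    llama_backend_free();
    return 0;
}
```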
@@ -29,9 +29,8 @@ llama_build_executable(test-tokenizer-0-llama.cpp)
llama_test_executable (test-tokenizer-0-llama test-tokenizer-0-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
llama_build_executable(test-tokenizer-0-falcon.cpp)
#llama_test_executable (test-tokenizer-0-falcon test-tokenizer-0-falcon.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
llama_build_executable(test-tokenizer-1.cpp)
llama_build_executable(test-tokenizer-1-llama.cpp)
# test-tokenizer-1 requires a BPE vocab. re-enable when we have one.
llama_test_executable (test-tokenizer-1-llama test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
#llama_test_executable (test-tokenizer-1.llama test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
#llama_test_executable(test-tokenizer-1.aquila test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
llama_build_and_test_executable(test-grammar-parser.cpp)
llama_build_and_test_executable(test-llama-grammar.cpp)
@@ -1,5 +1,6 @@
#include "llama.h"
#include "common.h"
#include "console.h"

#include <cstdio>
#include <string>
@@ -89,6 +90,12 @@ int main(int argc, char **argv) {
        return 2;
    }

#ifdef _WIN32
    // We need this for unicode console support
    console::init(false, false);
    atexit([]() { console::cleanup(); });
#endif

    bool success = true;

    for (const auto & test_kv : k_tests()) {
127
tests/test-tokenizer-1-llama.cpp
Normal file
@@ -0,0 +1,127 @@
#include "llama.h"
#include "common.h"
#include "console.h"

#include <cassert>
#include <cstdio>
#include <cstring>
#include <string>
#include <codecvt>
#include <map>
#include <vector>
#include <locale>

typedef int codepoint;

std::string codepoint_to_utf8(codepoint cp) {
    std::string result;
    if (0x00 <= cp && cp <= 0x7f) {
        result.push_back(cp);
    } else if (0x80 <= cp && cp <= 0x7ff) {
        result.push_back(0xc0 | ((cp >> 6) & 0x1f));
        result.push_back(0x80 | (cp & 0x3f));
    } else if (0x800 <= cp && cp <= 0xffff) {
        result.push_back(0xe0 | ((cp >> 12) & 0x0f));
        result.push_back(0x80 | ((cp >> 6) & 0x3f));
        result.push_back(0x80 | (cp & 0x3f));
    } else if (0x10000 <= cp && cp <= 0x10ffff) {
        result.push_back(0xf0 | ((cp >> 18) & 0x07));
        result.push_back(0x80 | ((cp >> 12) & 0x3f));
        result.push_back(0x80 | ((cp >> 6) & 0x3f));
        result.push_back(0x80 | (cp & 0x3f));
    } else {
        throw std::invalid_argument("invalid codepoint");
    }
    return result;
}

int main(int argc, char **argv) {
    if (argc < 2) {
        fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
        return 1;
    }

    const std::string fname = argv[1];

    fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());

    llama_model * model;
    llama_context * ctx;

    llama_backend_init(false);

    // load the vocab
    {
        auto lparams = llama_context_default_params();

        lparams.vocab_only = true;

        model = llama_load_model_from_file(fname.c_str(), lparams);

        if (model == NULL) {
            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
            return 1;
        }

        ctx = llama_new_context_with_model(model, lparams);

        if (ctx == NULL) {
            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
            llama_free_model(model);
            return 1;
        }
    }

    GGML_ASSERT(llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM);

#ifdef _WIN32
    // We need this for unicode console support
    console::init(false, false);
    atexit([]() { console::cleanup(); });
#endif

    const int n_vocab = llama_n_vocab(ctx);

    for (int i = 0; i < n_vocab; ++i) {
        std::string str = llama_detokenize_spm(ctx, std::vector<int>(1, i));
        std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
        std::string check = llama_detokenize_spm(ctx, tokens);
        if (check != str) {
            fprintf(stderr, "%s : error: token %d detokenizes to >%s<(%llu) but tokenization of this detokenizes to >%s<(%llu)\n",
                __func__, i, str.c_str(), str.length(), check.c_str(), check.length());
            if (i != 3)
                return 2;
        }
    }

    for (codepoint cp = 0x0000; cp < 0xffff; ++cp) {
        if (cp < 0xd800 || cp > 0xdfff) {
            std::string str = codepoint_to_utf8(cp);
            std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
            std::string check = llama_detokenize_spm(ctx, tokens);
            if (str != check) {
                fprintf(stderr, "%s : error: codepoint %d detokenizes to >%s<(%llu) instead of >%s<(%llu)\n",
                    __func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
                if (cp != 0 && cp != 9601)
                    return 3;
            }
        }
    }
    for (codepoint cp = 0x10000; cp < 0x0010ffff; ++cp) {
        std::string str = codepoint_to_utf8(cp);
        std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
        std::string check = llama_detokenize_spm(ctx, tokens);
        if (str != check) {
            fprintf(stderr, "%s : error: codepoint %d detokenizes to >%s<(%llu) instead of >%s<(%llu)\n",
                __func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
            return 4;
        }
    }

    llama_free_model(model);
    llama_free(ctx);

    llama_backend_free();

    return 0;
}
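As a side note, the `cp != 9601` escape hatch in the new test corresponds to U+2581 (decimal 9601), the ▁ marker SentencePiece uses for whitespace, which presumably does not round-trip exactly through detokenization. A hypothetical standalone driver for the UTF-8 encoder, with the encoder copied in reduced form so the snippet compiles on its own:

```cpp
#include <cstdio>
#include <stdexcept>
#include <string>

// Reduced copy of codepoint_to_utf8 above, kept here only so the snippet is self-contained.
static std::string encode_utf8(int cp) {
    std::string out;
    if (cp <= 0x7f) {
        out.push_back(cp);
    } else if (cp <= 0x7ff) {
        out.push_back(0xc0 | ((cp >> 6) & 0x1f));
        out.push_back(0x80 | (cp & 0x3f));
    } else if (cp <= 0xffff) {
        out.push_back(0xe0 | ((cp >> 12) & 0x0f));
        out.push_back(0x80 | ((cp >> 6) & 0x3f));
        out.push_back(0x80 | (cp & 0x3f));
    } else if (cp <= 0x10ffff) {
        out.push_back(0xf0 | ((cp >> 18) & 0x07));
        out.push_back(0x80 | ((cp >> 12) & 0x3f));
        out.push_back(0x80 | ((cp >> 6) & 0x3f));
        out.push_back(0x80 | (cp & 0x3f));
    } else {
        throw std::invalid_argument("invalid codepoint");
    }
    return out;
}

int main() {
    // U+0041 'A', U+00E9 'é', U+2581 '▁' (the SentencePiece space marker), U+1F600 emoji
    const int samples[] = { 0x41, 0xe9, 0x2581, 0x1f600 };
    for (int cp : samples) {
        if (cp >= 0xd800 && cp <= 0xdfff) continue; // surrogates are skipped, as in the test
        const std::string s = encode_utf8(cp);
        printf("U+%04X ->", cp);
        for (unsigned char b : s) printf(" %02X", b);
        printf("\n");
    }
    return 0;
}
```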
@@ -1,108 +0,0 @@
#include "llama.h"
#include "common.h"

#include <cassert>
#include <cstdio>
#include <cstring>
#include <string>
#include <codecvt>
#include <map>
#include <vector>
#include <locale>

static std::string escape_whitespace(const std::string& text) {
    std::string result = "\xe2\x96\x81";
    for (size_t offs = 0; offs < text.length(); ++offs) {
        if (text[offs] == ' ') {
            result += "\xe2\x96\x81";
        } else {
            result += text[offs];
        }
    }
    return result;
}

int main(int argc, char **argv) {
    if (argc < 2) {
        fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
        return 1;
    }

    const std::string fname = argv[1];

    fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());

    llama_model * model;
    llama_context * ctx;

    llama_backend_init(false);

    // load the vocab
    {
        auto lparams = llama_context_default_params();

        lparams.vocab_only = true;

        model = llama_load_model_from_file(fname.c_str(), lparams);

        if (model == NULL) {
            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
            return 1;
        }

        ctx = llama_new_context_with_model(model, lparams);

        if (ctx == NULL) {
            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
            llama_free_model(model);
            return 1;
        }
    }

    GGML_ASSERT(llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_BPE);

    const int n_vocab = llama_n_vocab(ctx);

    for (int i = 0; i < n_vocab; ++i) {
        std::string forward = llama_token_to_piece(ctx, i);
        std::vector<llama_token> tokens = llama_tokenize(ctx, forward, false);
        if (tokens.size() == 1) {
            if (i != tokens[0]) {
                std::string backward = llama_token_to_piece(ctx, tokens[0]);
                fprintf(stderr, "%s : error: token %d is string %s but bpe returns token %d %s\n",
                    __func__, i, llama_token_to_piece(ctx, i).c_str(), tokens[0], backward.c_str());
                return 2;
            }
        }
    }

#ifdef _WIN32
    std::wstring_convert<typename std::codecvt_utf8<char16_t>, char16_t> u16converter;
    for (char16_t ch = 0x0000; ch < 0xffff; ++ch) {
        std::u16string u16str(1, ch);
        std::string str = u16converter.to_bytes(u16str);
        std::vector<llama_token> tokens = llama_tokenize(ctx, escape_whitespace(str).c_str(), false);
        if (tokens.size() == 1) {
            fprintf(stderr, "%s : info: %s tokenized to %d \n",
                __func__, str.c_str(), tokens[0]);
        }
    }

    std::wstring_convert<typename std::codecvt_utf8<char32_t>, char32_t> u32converter;
    for (char32_t ch = 0x0000; ch < 0x0010ffff; ++ch) {
        std::u32string u32str(1, ch);
        std::string str = u32converter.to_bytes(u32str);
        std::vector<llama_token> tokens = llama_tokenize(ctx, escape_whitespace(str).c_str(), false);
        if (tokens.size() == 1) {
            fprintf(stderr, "%s : info: %s tokenized to %d \n", __func__, str.c_str(), tokens[0]);
        }
    }
#endif

    llama_free_model(model);
    llama_free(ctx);

    llama_backend_free();

    return 0;
}