Merge branch 'master' into master

commit e004116de9

34 changed files with 2029 additions and 470 deletions
22  .devops/cloud-v-pipeline  Normal file

@@ -0,0 +1,22 @@
+node('x86_runner1'){                       // Running on x86 runner containing latest vector qemu, latest vector gcc and all the necessary libraries
+    stage('Cleanup'){
+        cleanWs()                          // Cleaning previous CI build in workspace
+    }
+    stage('checkout repo'){
+        retry(5){                          // Retry if the cloning fails due to some reason
+            checkout scm                   // Clone the repo on Runner
+        }
+    }
+    stage('Compiling llama.cpp'){
+        sh'''#!/bin/bash
+            make RISCV=1 RISCV_CROSS_COMPILE=1 # Compiling llama for RISC-V
+        '''
+    }
+    stage('Running llama.cpp'){
+        sh'''#!/bin/bash
+            module load gnu-bin2/0.1 # loading latest versions of vector qemu and vector gcc
+            qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./main -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt # Running llama.cpp on vector qemu-riscv64
+            cat llama_log.txt # Printing results
+        '''
+    }
+}
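As a rough reproduction sketch of the two shell stages above, outside of Jenkins (the module name, sysroot path, and model file are taken verbatim from the pipeline and are specific to that runner, so treat them as placeholders):

```sh
# Sketch only: reproduce the RISC-V CI steps locally; runner-specific paths are assumptions.
make RISCV=1 RISCV_CROSS_COMPILE=1        # cross-compile llama.cpp for RISC-V

module load gnu-bin2/0.1                  # site-specific module providing vector qemu and vector gcc
qemu-riscv64 -L /softwares/gnu-bin2/sysroot \
    -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 \
    ./main -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt
cat llama_log.txt                         # print the generated tokens
```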
75  .github/workflows/build.yml  vendored

@@ -27,7 +27,7 @@ jobs:
     steps:
       - name: Clone
         id: checkout
-        uses: actions/checkout@v1
+        uses: actions/checkout@v3

       - name: Dependencies
         id: depends

@@ -52,7 +52,7 @@ jobs:
     steps:
       - name: Clone
         id: checkout
-        uses: actions/checkout@v1
+        uses: actions/checkout@v3

       - name: Dependencies
         id: depends

@@ -87,7 +87,7 @@ jobs:
     steps:
       - name: Clone
         id: checkout
-        uses: actions/checkout@v1
+        uses: actions/checkout@v3

       - name: Dependencies
         id: depends

@@ -121,7 +121,7 @@ jobs:
     steps:
       - name: Clone
         id: checkout
-        uses: actions/checkout@v1
+        uses: actions/checkout@v3

       - name: Dependencies
         id: depends

@@ -149,7 +149,7 @@ jobs:
     steps:
       - name: Clone
         id: checkout
-        uses: actions/checkout@v1
+        uses: actions/checkout@v3

       - name: Dependencies
         id: depends

@@ -174,7 +174,7 @@ jobs:
     steps:
       - name: Clone
         id: checkout
-        uses: actions/checkout@v1
+        uses: actions/checkout@v3

       - name: Dependencies
         id: depends

@@ -280,7 +280,7 @@ jobs:
     steps:
       - name: Clone
         id: checkout
-        uses: actions/checkout@v1
+        uses: actions/checkout@v3

       - name: Download OpenCL SDK
         id: get_opencl

@@ -390,20 +390,19 @@ jobs:

     strategy:
       matrix:
-        cuda: ['12.1.0', '11.7.1']
+        cuda: ['12.2.0', '11.7.1']
         build: ['cublas']

     steps:
       - name: Clone
         id: checkout
-        uses: actions/checkout@v1
+        uses: actions/checkout@v3

-      - uses: Jimver/cuda-toolkit@v0.2.10
+      - uses: Jimver/cuda-toolkit@v0.2.11
         id: cuda-toolkit
         with:
           cuda: ${{ matrix.cuda }}
-          # TODO(green-sky): _dev seems to fail, and non dev are not enought
-          #sub-packages: '["nvcc", "cudart", "cublas", "cudart_dev", "cublas_dev"]'
+          sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'

       - name: Build
         id: cmake_build

@@ -440,27 +439,11 @@ jobs:
             llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip

       - name: Copy and pack Cuda runtime
-        if: ${{ matrix.cuda == '12.1.0' }}
-        # TODO(green-sky): paths are cuda 12 specific
         run: |
           echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
-          mkdir '.\build\bin\cudart\'
-          cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cudart64_12.dll" '.\build\bin\cudart\'
-          cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublas64_12.dll" '.\build\bin\cudart\'
-          cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublasLt64_12.dll" '.\build\bin\cudart\'
-          7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip .\build\bin\cudart\*
-
-      - name: Copy and pack Cuda runtime
-        if: ${{ matrix.cuda == '11.7.1' }}
-        # TODO(green-sky): paths are cuda 11 specific
-        run: |
-          echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
-          mkdir '.\build\bin\cudart\'
-          ls "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin"
-          cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cudart64_110.dll" '.\build\bin\cudart\'
-          cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublas64_11.dll" '.\build\bin\cudart\'
-          cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublasLt64_11.dll" '.\build\bin\cudart\'
-          7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip .\build\bin\cudart\*
+          $dst='.\build\bin\cudart\'
+          robocopy "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
+          7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip $dst\*

       - name: Upload Cuda runtime
         if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}

@@ -469,6 +452,22 @@
           path: |
             cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip

+  freeBSD-latest:
+    runs-on: macos-12
+    steps:
+    - name: Clone
+      uses: actions/checkout@v3
+
+    - name: Build
+      uses: cross-platform-actions/action@v0.19.0
+      with:
+        operating_system: freebsd
+        version: '13.2'
+        run: |
+          sudo pkg update
+          sudo pkg install -y gmake automake autoconf pkgconf llvm15 clinfo clover opencl clblast openblas
+          gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15
+
   release:
     if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}

@@ -485,7 +484,7 @@
     steps:
       - name: Clone
         id: checkout
-        uses: actions/checkout@v1
+        uses: actions/checkout@v3

       - name: Determine tag name
         id: tag

@@ -543,7 +542,7 @@
 #
 #    steps:
 #      - name: Clone
-#        uses: actions/checkout@v1
+#        uses: actions/checkout@v3
 #
 #      - name: Dependencies
 #        run: |

@@ -567,7 +566,7 @@
 #
 #    steps:
 #      - name: Clone
-#        uses: actions/checkout@v1
+#        uses: actions/checkout@v3
 #
 #      - name: Dependencies
 #        run: |

@@ -591,7 +590,7 @@
 #
 #    steps:
 #      - name: Clone
-#        uses: actions/checkout@v1
+#        uses: actions/checkout@v3
 #
 #      - name: Dependencies
 #        run: |

@@ -621,7 +620,7 @@
 #
 #    steps:
 #      - name: Clone
-#        uses: actions/checkout@v1
+#        uses: actions/checkout@v3
 #
 #      - name: Add msbuild to PATH
 #        uses: microsoft/setup-msbuild@v1

@@ -660,7 +659,7 @@
 #
 #    steps:
 #      - name: Clone
-#        uses: actions/checkout@v1
+#        uses: actions/checkout@v3
 #
 #      - name: Add msbuild to PATH
 #        uses: microsoft/setup-msbuild@v1

@@ -706,7 +705,7 @@
 #
 #    steps:
 #      - name: Clone
-#        uses: actions/checkout@v1
+#        uses: actions/checkout@v3
 #
 #      - name: Dependencies
 #        run: |
15  .github/workflows/docker.yml  vendored

@@ -26,8 +26,15 @@ jobs:
     strategy:
       matrix:
         config:
-          - { tag: "light", dockerfile: ".devops/main.Dockerfile" }
-          - { tag: "full", dockerfile: ".devops/full.Dockerfile" }
+          - { tag: "light", dockerfile: ".devops/main.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          # NOTE(canardletter): The CUDA builds on arm64 are very slow, so I
+          #                     have disabled them for now until the reason why
+          #                     is understood.
+          - { tag: "light-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platforms: "linux/amd64" }
+          - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
+          - { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          - { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
     steps:
       - name: Check out the repo
         uses: actions/checkout@v3

@@ -51,7 +58,7 @@ jobs:
         with:
           context: .
           push: true
-          platforms: linux/amd64,linux/arm64
+          platforms: ${{ matrix.config.platforms }}
           tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
           file: ${{ matrix.config.dockerfile }}

@@ -60,6 +67,6 @@ jobs:
         with:
           context: .
           push: ${{ github.event_name == 'push' }}
-          platforms: linux/amd64,linux/arm64
+          platforms: ${{ matrix.config.platforms }}
          tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}"
          file: ${{ matrix.config.dockerfile }}
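The matrix above maps each image tag to a Dockerfile and a platform list, and the workflow hands those to docker/build-push-action. Purely as an illustrative sketch (not what CI literally runs), building one matrix entry locally with Buildx would look roughly like this, assuming a configured multi-platform builder:

```sh
# Sketch: local equivalent of the "light" matrix entry (amd64 + arm64).
docker buildx build \
    --platform linux/amd64,linux/arm64 \
    --file .devops/main.Dockerfile \
    --tag ghcr.io/ggerganov/llama.cpp:light \
    .
```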
2  .github/workflows/gguf-publish.yml  vendored

@@ -24,7 +24,7 @@ jobs:
     runs-on: ubuntu-latest

     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v3
     - name: Set up Python
       uses: actions/setup-python@v2
       with:
100  CMakeLists.txt

@@ -135,6 +135,7 @@ set(CMAKE_C_STANDARD 11)
 set(CMAKE_C_STANDARD_REQUIRED true)
 set(THREADS_PREFER_PTHREAD_FLAG ON)
 find_package(Threads REQUIRED)
+include(CheckCXXCompilerFlag)

 if (NOT MSVC)
     if (LLAMA_SANITIZE_THREAD)

@@ -171,8 +172,8 @@ if (LLAMA_METAL)
     find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)

     message(STATUS "Metal framework found")
-    set(GGML_SOURCES_METAL ggml-metal.m ggml-metal.h)
+    set(GGML_HEADERS_METAL ggml-metal.h)
+    set(GGML_SOURCES_METAL ggml-metal.m)

     add_compile_definitions(GGML_USE_METAL)
     if (LLAMA_METAL_NDEBUG)

@@ -191,7 +192,6 @@ if (LLAMA_METAL)
         ${METALKIT_FRAMEWORK}
         )
 endif()

 if (LLAMA_BLAS)
     if (LLAMA_STATIC)
         set(BLA_STATIC ON)

@@ -268,7 +268,8 @@
 endif()

 if (LLAMA_K_QUANTS)
-    set(GGML_SOURCES_EXTRA ${GGML_SOURCES_EXTRA} k_quants.c k_quants.h)
+    set(GGML_HEADERS_EXTRA k_quants.h)
+    set(GGML_SOURCES_EXTRA k_quants.c)
     add_compile_definitions(GGML_USE_K_QUANTS)
     if (LLAMA_QKK_64)
         add_compile_definitions(GGML_QKK_64)

@@ -284,7 +285,8 @@ if (LLAMA_CUBLAS)

     enable_language(CUDA)

-    set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h)
+    set(GGML_HEADERS_CUDA ggml-cuda.h)
+    set(GGML_SOURCES_CUDA ggml-cuda.cu)

     add_compile_definitions(GGML_USE_CUBLAS)
     # if (LLAMA_CUDA_CUBLAS)

@@ -332,6 +334,7 @@ if (LLAMA_MPI)
     find_package(MPI)
     if (MPI_C_FOUND)
         message(STATUS "MPI found")
+        set(GGML_HEADERS_MPI ggml-mpi.h)
         set(GGML_SOURCES_MPI ggml-mpi.c ggml-mpi.h)
         add_compile_definitions(GGML_USE_MPI)
         add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS})

@@ -354,7 +357,8 @@ if (LLAMA_CLBLAST)
     if (CLBlast_FOUND)
         message(STATUS "CLBlast found")

-        set(GGML_SOURCES_OPENCL ggml-opencl.cpp ggml-opencl.h)
+        set(GGML_HEADERS_OPENCL ggml-opencl.h)
+        set(GGML_SOURCES_OPENCL ggml-opencl.cpp)

         add_compile_definitions(GGML_USE_CLBLAST)

@@ -382,13 +386,15 @@ if (LLAMA_HIPBLAS)
         message(STATUS "HIP and hipBLAS found")
         add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
         add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h)
+        if (BUILD_SHARED_LIBS)
+            set_target_properties(ggml-rocm PROPERTIES POSITION_INDEPENDENT_CODE ON)
+        endif()
         if (LLAMA_CUDA_FORCE_DMMV)
             target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV)
         endif()
         target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
         target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
         target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
-        target_compile_definitions(ggml-rocm PRIVATE CC_TURING=1000000000)
         set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX)
         target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::rocblas roc::hipblas)

@@ -439,7 +445,7 @@ if (LLAMA_ALL_WARNINGS)

 endif()

-if (MSVC)
+if (WIN32)
     add_compile_definitions(_CRT_SECURE_NO_WARNINGS)

     if (BUILD_SHARED_LIBS)

@@ -461,6 +467,13 @@ endif()
 # TODO: probably these flags need to be tweaked on some architectures
 # feel free to update the Makefile for your architecture and send a pull request or issue
 message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
+if (MSVC)
+    string(TOLOWER "${CMAKE_GENERATOR_PLATFORM}" CMAKE_GENERATOR_PLATFORM_LWR)
+    message(STATUS "CMAKE_GENERATOR_PLATFORM: ${CMAKE_GENERATOR_PLATFORM}")
+else ()
+    set(CMAKE_GENERATOR_PLATFORM_LWR "")
+endif ()
+
 if (NOT MSVC)
     if (LLAMA_STATIC)
         add_link_options(-static)

@@ -476,25 +489,33 @@ if (NOT MSVC)
     endif()
 endif()

-if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64"))
+if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") OR ("${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "arm64"))
     message(STATUS "ARM detected")
     if (MSVC)
-        # TODO: arm msvc?
+        add_compile_definitions(__ARM_NEON)
+        add_compile_definitions(__ARM_FEATURE_FMA)
+        add_compile_definitions(__ARM_FEATURE_DOTPROD)
+        # add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) # MSVC doesn't support vdupq_n_f16, vld1q_f16, vst1q_f16
+        add_compile_definitions(__aarch64__) # MSVC defines _M_ARM64 instead
     else()
+        check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
+        if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
+            add_compile_options(-mfp16-format=ieee)
+        endif()
         if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
             # Raspberry Pi 1, Zero
-            add_compile_options(-mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access)
+            add_compile_options(-mfpu=neon-fp-armv8 -mno-unaligned-access)
         endif()
         if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
             # Raspberry Pi 2
-            add_compile_options(-mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations)
+            add_compile_options(-mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
         endif()
         if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
             # Raspberry Pi 3, 4, Zero 2 (32-bit)
-            add_compile_options(-mfp16-format=ieee -mno-unaligned-access)
+            add_compile_options(-mno-unaligned-access)
         endif()
     endif()
-elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$")
+elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "^(x86_64|i686|amd64|x64)$" )
     message(STATUS "x86 detected")
     if (MSVC)
         if (LLAMA_AVX512)

@@ -616,11 +637,11 @@ add_library(ggml OBJECT
             ggml.h
             ggml-alloc.c
             ggml-alloc.h
-            ${GGML_SOURCES_CUDA}
-            ${GGML_SOURCES_OPENCL}
-            ${GGML_SOURCES_METAL}
-            ${GGML_SOURCES_MPI}
-            ${GGML_SOURCES_EXTRA}
+            ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
+            ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
+            ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
+            ${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
+            ${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
             )

 target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})

@@ -658,14 +679,53 @@ if (BUILD_SHARED_LIBS)
     if (LLAMA_METAL)
         set_target_properties(llama PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")
     endif()
-    install(TARGETS llama LIBRARY)
 endif()

 #
 # install
 #

 include(GNUInstallDirs)
+include(CMakePackageConfigHelpers)
+
+set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR}
+    CACHE PATH "Location of header files")
+set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR}
+    CACHE PATH "Location of library files")
+set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR}
+    CACHE PATH "Location of binary files")
+set(LLAMA_BUILD_NUMBER ${BUILD_NUMBER})
+set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT})
+set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER})
+
+configure_package_config_file(
+        ${CMAKE_CURRENT_SOURCE_DIR}/scripts/LlamaConfig.cmake.in
+        ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake
+    INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Llama
+    PATH_VARS LLAMA_INCLUDE_INSTALL_DIR
+              LLAMA_LIB_INSTALL_DIR
+              LLAMA_BIN_INSTALL_DIR )
+
+write_basic_package_version_file(
+        ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfigVersion.cmake
+    VERSION ${LLAMA_INSTALL_VERSION}
+    COMPATIBILITY SameMajorVersion)
+
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake
+              ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfigVersion.cmake
+        DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Llama)
+
+set(GGML_PUBLIC_HEADERS "ggml.h"
+        "${GGML_HEADERS_CUDA}" "${GGML_HEADERS_OPENCL}"
+        "${GGML_HEADERS_METAL}" "${GGML_HEADERS_MPI}" "${GGML_HEADERS_EXTRA}")
+
+set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
+install(TARGETS ggml PUBLIC_HEADER)
+
+set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/llama.h)
+install(TARGETS llama LIBRARY PUBLIC_HEADER)
+
 install(
     FILES convert.py
     PERMISSIONS
33  Makefile

@@ -2,7 +2,7 @@
 BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative tests/test-c.o

 # Binaries only useful for tests
-TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1
+TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama

 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report

@@ -49,7 +49,7 @@ test: $(TEST_TARGETS)
 			./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \
 		elif [ "$$test_target" = "tests/test-tokenizer-0-falcon" ]; then \
 			continue; \
-		elif [ "$$test_target" = "tests/test-tokenizer-1" ]; then \
+		elif [ "$$test_target" = "tests/test-tokenizer-1-llama" ]; then \
 			continue; \
 		else \
 			echo "Running test $$test_target..."; \

@@ -110,50 +110,42 @@ MK_LDFLAGS =
 # CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
 # posix_memalign came in POSIX.1-2001 / SUSv3
 # M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985)
-MK_CFLAGS   += -D_XOPEN_SOURCE=600
-MK_CXXFLAGS += -D_XOPEN_SOURCE=600
+MK_CPPFLAGS += -D_XOPEN_SOURCE=600

 # Somehow in OpenBSD whenever POSIX conformance is specified
 # some string functions rely on locale_t availability,
 # which was introduced in POSIX.1-2008, forcing us to go higher
 ifeq ($(UNAME_S),OpenBSD)
-	MK_CFLAGS   += -U_XOPEN_SOURCE -D_XOPEN_SOURCE=700
-	MK_CXXFLAGS += -U_XOPEN_SOURCE -D_XOPEN_SOURCE=700
+	MK_CPPFLAGS += -U_XOPEN_SOURCE -D_XOPEN_SOURCE=700
 endif

 # Data types, macros and functions related to controlling CPU affinity and
 # some memory allocation are available on Linux through GNU extensions in libc
 ifeq ($(UNAME_S),Linux)
-	MK_CFLAGS   += -D_GNU_SOURCE
-	MK_CXXFLAGS += -D_GNU_SOURCE
+	MK_CPPFLAGS += -D_GNU_SOURCE
 endif

 # RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
 # and on macOS its availability depends on enabling Darwin extensions
 # similarly on DragonFly, enabling BSD extensions is necessary
 ifeq ($(UNAME_S),Darwin)
-	MK_CFLAGS   += -D_DARWIN_C_SOURCE
-	MK_CXXFLAGS += -D_DARWIN_C_SOURCE
+	MK_CPPFLAGS += -D_DARWIN_C_SOURCE
 endif
 ifeq ($(UNAME_S),DragonFly)
-	MK_CFLAGS   += -D__BSD_VISIBLE
-	MK_CXXFLAGS += -D__BSD_VISIBLE
+	MK_CPPFLAGS += -D__BSD_VISIBLE
 endif

 # alloca is a non-standard interface that is not visible on BSDs when
 # POSIX conformance is specified, but not all of them provide a clean way
 # to enable it in such cases
 ifeq ($(UNAME_S),FreeBSD)
-	MK_CFLAGS   += -D__BSD_VISIBLE
-	MK_CXXFLAGS += -D__BSD_VISIBLE
+	MK_CPPFLAGS += -D__BSD_VISIBLE
 endif
 ifeq ($(UNAME_S),NetBSD)
-	MK_CFLAGS   += -D_NETBSD_SOURCE
-	MK_CXXFLAGS += -D_NETBSD_SOURCE
+	MK_CPPFLAGS += -D_NETBSD_SOURCE
 endif
 ifeq ($(UNAME_S),OpenBSD)
-	MK_CFLAGS   += -D_BSD_SOURCE
-	MK_CXXFLAGS += -D_BSD_SOURCE
+	MK_CPPFLAGS += -D_BSD_SOURCE
 endif

 ifdef LLAMA_DEBUG

@@ -182,7 +174,7 @@ MK_CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow
             -Wmissing-prototypes -Werror=implicit-int -Wno-unused-function
 MK_CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar

-ifeq '' '$(findstring clang++,$(CXX))'
+ifeq '' '$(findstring clang,$(shell $(CXX) --version))'
 	# g++ only
 	MK_CXXFLAGS += -Wno-format-truncation -Wno-array-bounds
 endif

@@ -408,7 +400,6 @@ ifdef LLAMA_HIPBLAS
 	HIPFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
 	HIPFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
 	HIPFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
-	HIPFLAGS += -DCC_TURING=1000000000
 ifdef LLAMA_CUDA_FORCE_DMMV
 	HIPFLAGS += -DGGML_CUDA_FORCE_DMMV
 endif # LLAMA_CUDA_FORCE_DMMV

@@ -606,7 +597,7 @@ tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp build-info.h gg
 tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-tokenizer-1: tests/test-tokenizer-1.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

 tests/test-c.o: tests/test-c.c llama.h
13  README.md

@@ -844,8 +844,17 @@ Place your desired model into the `~/llama.cpp/models/` directory and execute th
 #### Images
 We have two Docker images available for this project:

-1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
-2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file.
+1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`)
+2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`)
+
+Additionally, there are the following images, similar to the above:
+
+- `ghcr.io/ggerganov/llama.cpp:full-cuda`: Same as `full` but compiled with CUDA support. (platforms: `linux/amd64`)
+- `ghcr.io/ggerganov/llama.cpp:light-cuda`: Same as `light` but compiled with CUDA support. (platforms: `linux/amd64`)
+- `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
+- `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
+
+The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](.devops/) and the Gitlab Action defined in [.github/workflows/docker.yml](.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library), you'll need to build the images locally for now.

 #### Usage
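As a usage sketch for the image tags listed above (not part of this diff; the model path and prompt are placeholders), the `light` image can be run the same way on either supported platform, with Docker picking the matching architecture:

```sh
# Sketch: run the "light" image against a locally mounted models directory (hypothetical path).
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light \
    -m /models/7B/ggml-model-q4_0.gguf -p "Hello" -n 64
```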
@@ -374,6 +374,17 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 #else
             fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
             fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+#endif
+        } else if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
+            params.n_gpu_layers_draft = std::stoi(argv[i]);
+#else
+            fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n");
+            fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
 #endif
         } else if (arg == "--main-gpu" || arg == "-mg") {
             if (++i >= argc) {

@@ -423,8 +434,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 #endif // GGML_USE_CUBLAS
         } else if (arg == "--no-mmap") {
             params.use_mmap = false;
-        } else if (arg == "--mtest") {
-            params.mem_test = true;
         } else if (arg == "--numa") {
             params.numa = true;
         } else if (arg == "--export") {

@@ -664,6 +673,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
     printf("  -ngl N, --n-gpu-layers N\n");
     printf("                        number of layers to store in VRAM\n");
+    printf("  -ngld N, --n-gpu-layers-draft N\n");
+    printf("                        number of layers to store in VRAM for the draft model\n");
     printf("  -ts SPLIT --tensor-split SPLIT\n");
     printf("                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
     printf("  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");

@@ -674,7 +685,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("                        Not recommended since this is both slower and uses more VRAM.\n");
 #endif // GGML_USE_CUBLAS
 #endif
-    printf("  --mtest               compute maximum memory usage\n");
     printf("  --export              export the computation graph to 'llama.ggml'\n");
     printf("  --verbose-prompt      print prompt before generation\n");
     fprintf(stderr, "  --simple-io           use basic IO for better compatibility in subprocesses and limited consoles\n");

@@ -1212,7 +1222,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
     fprintf(stream, "model: %s # default: models/7B/ggml-model.bin\n", params.model.c_str());
     fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
-    fprintf(stream, "mtest: %s # default: false\n", params.mem_test ? "true" : "false");
     fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
     fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
     fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
@@ -38,6 +38,7 @@ struct gpt_params {
     int32_t n_draft                         = 16;    // number of tokens to draft during speculative decoding
     int32_t n_chunks                        = -1;    // max number of chunks to process (-1 = unlimited)
     int32_t n_gpu_layers                    = -1;    // number of layers to store in VRAM (-1 - use default)
+    int32_t n_gpu_layers_draft              = -1;    // number of layers to store in VRAM for the draft model (-1 - use default)
     int32_t main_gpu                        = 0;     // the GPU that is used for scratch and small tensors
     float   tensor_split[LLAMA_MAX_DEVICES] = {0};   // how split tensors should be distributed across GPUs
     int32_t n_probs                         = 0;     // if greater than 0, output the probabilities of top n_probs tokens.

@@ -109,7 +110,6 @@ struct gpt_params {
     bool perplexity        = false; // compute perplexity over the prompt
     bool use_mmap          = true;  // use mmap for faster loads
     bool use_mlock         = false; // use mlock to keep model in memory
-    bool mem_test          = false; // compute maximum memory usage
     bool numa              = false; // attempt optimizations that help on some NUMA systems
     bool export_cgraph     = false; // export the computation graph
     bool verbose_prompt    = false; // print prompt tokens before generation
292  convert-baichuan-hf-to-gguf.py  Executable file

@@ -0,0 +1,292 @@
+#!/usr/bin/env python3
+# HF baichuan --> gguf conversion
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import struct
+import sys
+from pathlib import Path
+from typing import TYPE_CHECKING, Any
+import itertools
+import gguf
+import numpy as np
+import torch
+from sentencepiece import SentencePieceProcessor # type: ignore[import]
+
+
+if TYPE_CHECKING:
+    from typing import TypeAlias
+
+NDArray: TypeAlias = 'np.ndarray[Any, Any]'
+
+# reverse HF permute back to original pth layout
+
+
+def reverse_hf_permute(weights: NDArray, n_head: int, n_kv_head: int | None = None) -> NDArray:
+    if n_kv_head is not None and n_head != n_kv_head:
+        n_head //= n_kv_head
+
+    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+            .swapaxes(1, 2)
+            .reshape(weights.shape))
+
+def reverse_hf_permute_part(weights: NDArray, n_part: int, n_head: int, n_head_kv: int| None = None) -> NDArray:
+    r = weights.shape[0] // 3
+    return (reverse_hf_permute(weights[r * n_part : r * n_part + r, ...], n_head, n_head_kv))
+
+def reverse_hf_part(weights: NDArray, n_part: int) -> NDArray:
+    r = weights.shape[0] // 3
+    return weights[r * n_part : r * n_part + r, ...]
+
+def count_model_parts(dir_model: str) -> int:
+    num_parts = 0
+
+    for filename in os.listdir(dir_model):
+        if filename.startswith("pytorch_model-"):
+            num_parts += 1
+
+    if num_parts > 0:
+        print("gguf: found " + str(num_parts) + " model parts")
+
+    return num_parts
+
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Convert a HuggingFace LLaMA model to a GGML compatible file")
+    parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
+    parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
+    parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
+    parser.add_argument("ftype", type=int, choices=[0, 1], help="output format - use 0 for float32, 1 for float16", default = 1)
+    return parser.parse_args()
+
+args = parse_args()
+
+dir_model = args.model
+ftype = args.ftype
+if not dir_model.is_dir():
+    print(f'Error: {args.model} is not a directory', file = sys.stderr)
+    sys.exit(1)
+
+# possible tensor data types
+#   ftype == 0 -> float32
+#   ftype == 1 -> float16
+
+# map from ftype to string
+ftype_str = ["f32", "f16"]
+
+if args.outfile is not None:
+    fname_out = args.outfile
+else:
+    # output in the same directory as the model by default
+    fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
+
+print("gguf: loading model "+dir_model.name)
+
+with open(dir_model / "config.json", "r", encoding="utf-8") as f:
+    hparams = json.load(f)
+print("hello print: ",hparams["architectures"][0])
+if hparams["architectures"][0] != "BaichuanForCausalLM":
+    print("Model architecture not supported: " + hparams["architectures"][0])
+
+    sys.exit()
+
+# get number of model parts
+num_parts = count_model_parts(dir_model)
+print(f"num_parts:{num_parts}\n")
+ARCH=gguf.MODEL_ARCH.BAICHUAN
+gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
+
+print("gguf: get model metadata")
+
+block_count = hparams["num_hidden_layers"]
+head_count = hparams["num_attention_heads"]
+
+if "num_key_value_heads" in hparams:
+    head_count_kv = hparams["num_key_value_heads"]
+else:
+    head_count_kv = head_count
+
+if "_name_or_path" in hparams:
+    hf_repo = hparams["_name_or_path"]
+else:
+    hf_repo = ""
+
+if "max_sequence_length" in hparams:
+    ctx_length = hparams["max_sequence_length"]
+elif "max_position_embeddings" in hparams:
+    ctx_length = hparams["max_position_embeddings"]
+elif "model_max_length" in hparams:
+    ctx_length = hparams["model_max_length"]
+else:
+    print("gguf: can not find ctx length parameter.")
+
+    sys.exit()
+
+
+gguf_writer.add_name(dir_model.name)
+gguf_writer.add_source_hf_repo(hf_repo)
+gguf_writer.add_tensor_data_layout("Meta AI original pth")
+gguf_writer.add_context_length(ctx_length)
+gguf_writer.add_embedding_length(hparams["hidden_size"])
+gguf_writer.add_block_count(block_count)
+gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
+gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
+gguf_writer.add_head_count(head_count)
+gguf_writer.add_head_count_kv(head_count_kv)
+gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
+
+if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]:
+    if "type" in hparams["rope_scaling"]:
+        if hparams["rope_scaling"]["type"] == "linear":
+            gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"])
+
+
+# TOKENIZATION
+
+print("gguf: get tokenizer metadata")
+
+tokens: list[bytes] = []
+scores: list[float] = []
+toktypes: list[int] = []
+
+tokenizer_model_file = dir_model / 'tokenizer.model'
+if not tokenizer_model_file.is_file():
+    print(f'Error: Missing {tokenizer_model_file}', file = sys.stderr)
+    sys.exit(1)
+
+# vocab type sentencepiece
+print("gguf: get sentencepiece tokenizer vocab, scores and token types")
+
+tokenizer = SentencePieceProcessor(str(tokenizer_model_file))
+
+for i in range(tokenizer.vocab_size()):
+    text: bytes
+    score: float
+
+    piece = tokenizer.id_to_piece(i)
+    text = piece.encode("utf-8")
+    score = tokenizer.get_score(i)
+
+    toktype = 1 # defualt to normal token type
+    if tokenizer.is_unknown(i):
+        toktype = 2
+    if tokenizer.is_control(i):
+        toktype = 3
+
+    # toktype = 4 is user-defined = tokens from added_tokens.json
+
+    if tokenizer.is_unused(i):
+        toktype = 5
+    if tokenizer.is_byte(i):
+        toktype = 6
+
+    tokens.append(text)
+    scores.append(score)
+    toktypes.append(toktype)
+
+added_tokens_file = dir_model / 'added_tokens.json'
+if added_tokens_file.is_file():
+    with open(added_tokens_file, "r", encoding="utf-8") as f:
+        addtokens_json = json.load(f)
+
+        print("gguf: get added tokens")
+
+        for key in addtokens_json:
+            tokens.append( key.encode("utf-8") )
+            scores.append(-1000.0)
+            toktypes.append(4) # user-defined token type
+
+
+gguf_writer.add_tokenizer_model("llama")
+gguf_writer.add_token_list(tokens)
+gguf_writer.add_token_scores(scores)
+gguf_writer.add_token_types(toktypes)
+
+special_vocab = gguf.SpecialVocab(dir_model)
+special_vocab.add_to_gguf(gguf_writer)
+
+# TENSORS
+
+tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
+
+# tensor info
+print("gguf: get tensor metadata")
+
+if num_parts == 0:
+    part_names = iter(("pytorch_model.bin",))
+else:
+    part_names = (
+        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
+    )
+
+
+for part_name in part_names:
+    if args.vocab_only:
+        break
+    print("gguf: loading model part '" + part_name + "'")
+    model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
+
+    tmp=model_part
+    for i in range(block_count):
+        if f"model.layers.{i}.self_attn.W_pack.weight" in model_part:
+            print(f"Unpacking and permuting layer {i}")
+            tmp[f"model.layers.{i}.self_attn.q_proj.weight"]=reverse_hf_permute_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],0,head_count,head_count)
+            tmp[f"model.layers.{i}.self_attn.k_proj.weight"]=reverse_hf_permute_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],1,head_count,head_count_kv)
+            tmp[f"model.layers.{i}.self_attn.v_proj.weight"]=reverse_hf_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],2)
+            del tmp[f"model.layers.{i}.self_attn.W_pack.weight"]
+
+    for name in model_part.keys():
+        data = model_part[name]
+        # we don't need these
+        if name.endswith(".rotary_emb.inv_freq"):
+            continue
+
+        old_dtype = data.dtype
+
+        # convert any unsupported data types to float32
+        if data.dtype != torch.float16 and data.dtype != torch.float32:
+            data = data.to(torch.float32)
+
+        data = data.squeeze().numpy()
+
+        # map tensor names
+        new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
+        if new_name is None:
+            print("Can not map tensor '" + name + "'")
+            sys.exit()
+
+        n_dims = len(data.shape)
+        data_dtype = data.dtype
+
+        # if f32 desired, convert any float16 to float32
+        if ftype == 0 and data_dtype == np.float16:
+            data = data.astype(np.float32)
+
+        # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
+        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+            data = data.astype(np.float32)
+
+        # if f16 desired, convert any float32 2-dim weight tensors to float16
+        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+            data = data.astype(np.float16)
+
+        print(name + " -> " + new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
+        gguf_writer.add_tensor(new_name, data)
+
+
+print("gguf: write header")
+gguf_writer.write_header_to_file()
+print("gguf: write metadata")
+gguf_writer.write_kv_data_to_file()
+if not args.vocab_only:
+    print("gguf: write tensors")
+    gguf_writer.write_tensors_to_file()
+
+gguf_writer.close()
+
+print(f"gguf: model successfully exported to '{fname_out}'")
+print("")
@@ -137,7 +137,9 @@ with open(tokenizer_json_file, "r", encoding="utf-8") as f:

 print("gguf: get gpt2 tokenizer vocab")

-vocab_size = len(tokenizer_json["model"]["vocab"])
+# The number of tokens in tokenizer.json can differ from the expected vocab size.
+# This causes downstream issues with mismatched tensor sizes when running the inference
+vocab_size = hparams["vocab_size"] if "vocab_size" in hparams else len(tokenizer_json["model"]["vocab"])

 # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
 tokenizer = AutoTokenizer.from_pretrained(dir_model)
51  examples/main-cmake-pkg/.gitignore  vendored  Normal file

@@ -0,0 +1,51 @@
+# Prerequisites
+*.d
+
+# Compiled Object files
+*.slo
+*.lo
+*.o
+*.obj
+
+# Precompiled Headers
+*.gch
+*.pch
+
+# Compiled Dynamic libraries
+*.so
+*.dylib
+*.dll
+
+# Fortran module files
+*.mod
+*.smod
+
+# Compiled Static libraries
+*.lai
+*.la
+*.a
+*.lib
+
+# Executables
+*.exe
+*.out
+*.app
+
+*.gguf
+
+*.log
+.DS_Store
+.build/
+.cache/
+.direnv/
+.envrc
+.swiftpm
+.venv
+.clang-tidy
+.vs/
+.vscode/
+
+build*/
+out/
+tmp/
36  examples/main-cmake-pkg/CMakeLists.txt  Normal file

@@ -0,0 +1,36 @@
+cmake_minimum_required(VERSION 3.12)
+project("main-cmake-pkg" C CXX)
+set(TARGET main-cmake-pkg)
+
+find_package(Llama 0.0.1 REQUIRED)
+
+# Bake common functionality in with target. Because applications
+# using the relocatable Llama package should be outside of the
+# source tree, main-cmake-pkg pretends the dependencies are built-in.
+
+set(_common_path "${CMAKE_CURRENT_LIST_DIR}/../../common")
+add_library(common OBJECT
+    ${_common_path}/common.h
+    ${_common_path}/common.cpp
+    ${_common_path}/console.h
+    ${_common_path}/console.cpp
+    ${_common_path}/grammar-parser.h
+    ${_common_path}/grammar-parser.cpp
+    )
+
+# WARNING: because build-info.h is auto-generated, it will only
+# be available after the user has built the llama.cpp sources.
+#
+configure_file(${_common_path}/../build-info.h
+    ${CMAKE_CURRENT_BINARY_DIR}/build-info.h
+    COPYONLY)
+
+target_include_directories(common PUBLIC ${LLAMA_INCLUDE_DIR}
+    ${CMAKE_CURRENT_BINARY_DIR})
+
+add_executable(${TARGET} ${CMAKE_CURRENT_LIST_DIR}/../main/main.cpp)
+target_include_directories(${TARGET} PRIVATE ${_common_path})
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
37 examples/main-cmake-pkg/README.md Normal file

@@ -0,0 +1,37 @@
# llama.cpp/example/main-cmake-pkg

This program builds the [main](../main) application using a relocatable CMake package. It serves as an example of using the `find_package()` CMake command to conveniently include [llama.cpp](https://github.com/ggerganov/llama.cpp) in projects which live outside of the source tree.

## Building

Because this example is "outside of the source tree", it is important to first build/install llama.cpp using CMake. An example is provided here, but please see the [llama.cpp build instructions](../..) for more detailed build instructions.

### Considerations

When hardware acceleration libraries are used (e.g. cuBLAS, Metal, CLBlast, etc.), CMake must be able to locate the associated CMake package. In the example below, when building _main-cmake-pkg_ notice that the `CMAKE_PREFIX_PATH` includes the Llama CMake package location _in addition to_ the CLBlast package, which was used when compiling _llama.cpp_.

### Build llama.cpp and install to C:\LlamaCPP directory

In this case, CLBlast was already installed so the CMake package is referenced in `CMAKE_PREFIX_PATH`.

```cmd
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
mkdir build
cd build
cmake .. -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=C:/CLBlast/lib/cmake/CLBlast -G "Visual Studio 17 2022" -A x64
cmake --build . --config Release
cmake --install . --prefix C:/LlamaCPP
```

### Build main-cmake-pkg

```cmd
cd ..\examples\main-cmake-pkg
mkdir build
cd build
cmake .. -DBUILD_SHARED_LIBS=OFF -DCMAKE_PREFIX_PATH="C:/CLBlast/lib/cmake/CLBlast;C:/LlamaCPP/lib/cmake/Llama" -G "Visual Studio 17 2022" -A x64
cmake --build . --config Release
cmake --install . --prefix C:/MyLlamaApp
```
@@ -144,7 +144,7 @@ The `--ctx-size` option allows you to set the size of the prompt context used by

 Some fine-tuned models have extended the context length by scaling RoPE. For example, if the original pretrained model has a context length (max sequence length) of 4096 (4k) and the fine-tuned model has 32k, that is a scaling factor of 8, and it should work by setting the above `--ctx-size` to 32768 (32k) and `--rope-scale` to 8.

 - `--rope-scale N`: Where N is the linear scaling factor used by the fine-tuned model.

 ### Keep Prompt

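The factor above is just the ratio of the fine-tuned to the original context length (32768 / 4096 = 8). As a minimal C++ sketch of linear RoPE scaling only, with made-up function and parameter names rather than the actual llama.cpp implementation, the position index is divided by `--rope-scale` before the rotation angle is computed:

```cpp
#include <cmath>
#include <cstdio>

// Linear RoPE scaling ("position interpolation"): divide the position by the
// scale factor so a 32k context maps back into the 0..4096 range the model
// was pre-trained on. freq_base = 10000.0f is the common RoPE default.
static float rope_angle(int pos, int dim_pair, int n_dims, float rope_scale,
                        float freq_base = 10000.0f) {
    const float scaled_pos = (float) pos / rope_scale;                   // --rope-scale N
    const float theta      = std::pow(freq_base, -2.0f * dim_pair / n_dims);
    return scaled_pos * theta;
}

int main() {
    const float rope_scale = 32768.0f / 4096.0f; // fine-tuned ctx / original ctx = 8
    std::printf("angle(pos=20000, pair=0) = %f\n", rope_angle(20000, 0, 128, rope_scale));
    return 0;
}
```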
@@ -274,7 +274,7 @@ These options help improve the performance and memory usage of the LLaMA models.

 ### NUMA support

-- `--numa`: Attempt optimizations that help on some systems with non-uniform memory access. This currently consists of pinning an equal proportion of the threads to the cores on each NUMA node, and disabling prefetch and readahead for mmap. The latter causes mapped pages to be faulted in on first access instead of all at once, and in combination with pinning threads to NUMA nodes, more of the pages end up on the NUMA node where they are used. Note that if the model is already in the system page cache, for example because of a previous run without this option, this will have little effect unless you drop the page cache first. This can be done by rebooting the system or on Linux by writing '3' to '/proc/sys/vm/drop\_caches' as root.
+- `--numa`: Attempt optimizations that help on some systems with non-uniform memory access. This currently consists of pinning an equal proportion of the threads to the cores on each NUMA node, and disabling prefetch and readahead for mmap. The latter causes mapped pages to be faulted in on first access instead of all at once, and in combination with pinning threads to NUMA nodes, more of the pages end up on the NUMA node where they are used. Note that if the model is already in the system page cache, for example because of a previous run without this option, this will have little effect unless you drop the page cache first. This can be done by rebooting the system or on Linux by writing '3' to '/proc/sys/vm/drop_caches' as root.

 ### Memory Float 32

|
||||||
|
|
||||||
- `-h, --help`: Display a help message showing all available options and their default values. This is particularly useful for checking the latest options and default values, as they can change frequently, and the information in this document may become outdated.
|
- `-h, --help`: Display a help message showing all available options and their default values. This is particularly useful for checking the latest options and default values, as they can change frequently, and the information in this document may become outdated.
|
||||||
- `--verbose-prompt`: Print the prompt before generating text.
|
- `--verbose-prompt`: Print the prompt before generating text.
|
||||||
- `--mtest`: Test the model's functionality by running a series of tests to ensure it's working properly.
|
|
||||||
- `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
|
- `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
|
||||||
- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
|
- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
|
||||||
- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
|
- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
|
||||||
|
|
|
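As a quick check of the `--tensor-split` arithmetic mentioned above, a standalone sketch (not llama.cpp code) that normalizes the proportions by their sum:

```cpp
#include <cstdio>
#include <vector>

int main() {
    // --tensor-split "3,2": each GPU's share is its value divided by the total.
    const std::vector<float> split = {3.0f, 2.0f};
    float total = 0.0f;
    for (float s : split) total += s;
    for (size_t i = 0; i < split.size(); ++i) {
        std::printf("GPU %zu gets %.0f%% of the data\n", i, 100.0f * split[i] / total); // 60% / 40%
    }
    return 0;
}
```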
@@ -198,23 +198,6 @@ int main(int argc, char ** argv) {
             params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
     }

-    // determine the maximum memory usage needed to do inference for the given n_batch and n_ctx parameters
-    // uncomment the "used_mem" line in llama.cpp to see the results
-    if (params.mem_test) {
-        {
-            LOG_TEE("%s: testing memory usage for n_batch = %d, n_ctx = %d\n", __func__, params.n_batch, params.n_ctx);
-
-            const std::vector<llama_token> tmp(params.n_batch, llama_token_bos(ctx));
-            llama_eval(ctx, tmp.data(), tmp.size(), params.n_ctx, params.n_threads);
-        }
-
-        llama_print_timings(ctx);
-        llama_free(ctx);
-        llama_free_model(model);
-
-        return 0;
-    }
-
     // export the cgraph and exit
     if (params.export_cgraph) {
         llama_eval_export(ctx, "llama.ggml");
@@ -42,6 +42,7 @@ int main(int argc, char ** argv) {

     // load the draft model
     params.model = params.model_draft;
+    params.n_gpu_layers = params.n_gpu_layers_draft;
     std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params);

     // tokenize the prompt
@@ -81,7 +82,7 @@ int main(int argc, char ** argv) {
     //GGML_ASSERT(n_vocab == llama_n_vocab(ctx_dft));

     // how many tokens to draft each time
-    const int n_draft = params.n_draft;
+    int n_draft = params.n_draft;

     int n_predict = 0;
     int n_drafted = 0;
@@ -130,6 +131,7 @@ int main(int argc, char ** argv) {
         LOG("drafted: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_dft, drafted));

         int i_dft = 0;

         while (true) {
             // sample from the target model
             const llama_token id = llama_sample_token(ctx_tgt, NULL, grammar_tgt, params, last_tokens, candidates, i_dft);
@@ -173,6 +175,27 @@ int main(int argc, char ** argv) {
             llama_eval(ctx_dft, &id, 1, n_past_dft, params.n_threads);
             ++n_past_dft;

+            // heuristic for n_draft
+            {
+                const int n_draft_cur = (int) drafted.size();
+                const bool all_accepted = i_dft == n_draft_cur;
+
+                LOG("n_draft      = %d\n", n_draft);
+                LOG("n_draft_cur  = %d\n", n_draft_cur);
+                LOG("i_dft        = %d\n", i_dft);
+                LOG("all_accepted = %d\n", all_accepted);
+
+                if (all_accepted && n_draft == n_draft_cur) {
+                    LOG(" - max drafted tokens accepted - n_draft += 8\n");
+                    n_draft = std::min(30, n_draft + 8);
+                } else if (all_accepted) {
+                    LOG(" - partially drafted tokens accepted - no change\n");
+                } else {
+                    LOG(" - drafted token rejected - n_draft -= 1\n");
+                    n_draft = std::max(2, n_draft - 1);
+                }
+            }
+
             drafted.clear();
             drafted.push_back(id);
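For readability, the adaptive draft-size rule added in the hunk above boils down to this standalone sketch (same logic, extracted here only for illustration):

```cpp
#include <algorithm>
#include <cstdio>

// Clamped update rule: grow fast when the whole draft was accepted,
// shrink slowly on a rejection, and stay within [2, 30] drafted tokens.
static int update_n_draft(int n_draft, int n_draft_cur, int i_dft) {
    const bool all_accepted = i_dft == n_draft_cur;
    if (all_accepted && n_draft == n_draft_cur) {
        return std::min(30, n_draft + 8);
    }
    if (all_accepted) {
        return n_draft; // a shorter draft was fully accepted - no change
    }
    return std::max(2, n_draft - 1);
}

int main() {
    int n = 16;
    n = update_n_draft(n, 16, 16); std::printf("after full acceptance: %d\n", n); // 24
    n = update_n_draft(n, 24, 10); std::printf("after a rejection:     %d\n", n); // 23
    return 0;
}
```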
@@ -34,7 +34,7 @@
           with pkgs; [ openblas ]
         );
       pkgs = import nixpkgs { inherit system; };
-      nativeBuildInputs = with pkgs; [ cmake ninja pkgconfig ];
+      nativeBuildInputs = with pkgs; [ cmake ninja pkg-config ];
       llama-python =
         pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece ]);
       postPatch = ''
@@ -45,6 +45,8 @@
       postInstall = ''
         mv $out/bin/main $out/bin/llama
         mv $out/bin/server $out/bin/llama-server
+        mkdir -p $out/include
+        cp ${src}/llama.h $out/include/
       '';
       cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" "-DLLAMA_MPI=ON" "-DBUILD_SHARED_LIBS=ON" "-DCMAKE_SKIP_BUILD_RPATH=ON" ];
     in
495 ggml-cuda.cu
@@ -13,7 +13,7 @@
 #ifdef __HIP_PLATFORM_AMD__
 // for rocblas_initialize()
 #include "rocblas/rocblas.h"
-#endif
+#endif // __HIP_PLATFORM_AMD__
 #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
 #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
 #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
@@ -68,19 +68,29 @@
 #include <cuda_runtime.h>
 #include <cublas_v2.h>
 #include <cuda_fp16.h>
-#endif
+#endif // defined(GGML_USE_HIPBLAS)

 #include "ggml-cuda.h"
 #include "ggml.h"

 #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
-#ifndef CC_TURING
-#define CC_TURING 700
-#endif
+#define CC_TURING 700
+#define CC_OFFSET_AMD 1000000
+#define CC_RDNA2 CC_OFFSET_AMD + 1030

 #if defined(GGML_USE_HIPBLAS)
 #define __CUDA_ARCH__ 1300

+#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
+    defined(__gfx1150__) || defined(__gfx1151__)
+#define RDNA3
+#endif
+
+#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
+    defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
+#define RDNA2
+#endif
+
 #ifndef __has_builtin
 #define __has_builtin(x) 0
 #endif
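A quick worked example of the compute-capability offset introduced above (illustrative only, not code from ggml-cuda.cu): AMD devices are pushed above every NVIDIA value by `CC_OFFSET_AMD`, so a single integer comparison can tell an RDNA2-class AMD GPU apart from a Turing-class NVIDIA GPU:

```cpp
#include <cstdio>

// Mirrors the macros from the diff: AMD compute capabilities are offset by
// 1000000 so they never collide with NVIDIA values (e.g. Turing = 700).
constexpr int CC_TURING     = 700;
constexpr int CC_OFFSET_AMD = 1000000;
constexpr int CC_RDNA2      = CC_OFFSET_AMD + 1030;

int main() {
    // Assuming a gfx1030 (RDNA2) device reports major=10, minor=3, it is stored
    // as 1000000 + 100*10 + 10*3; an RTX 2080 (Turing, 7.5) stays at 750.
    const int cc_amd_gfx1030 = CC_OFFSET_AMD + 100*10 + 10*3;
    const int cc_nv_rtx2080  = 100*7 + 10*5;

    std::printf("gfx1030 >= CC_RDNA2  ? %d\n", cc_amd_gfx1030 >= CC_RDNA2);  // 1
    std::printf("rtx2080 >= CC_TURING ? %d\n", cc_nv_rtx2080 >= CC_TURING);  // 1
    std::printf("rtx2080 >= CC_RDNA2  ? %d\n", cc_nv_rtx2080 >= CC_RDNA2);   // 0
    return 0;
}
```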
@@ -132,7 +142,7 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
 #endif
     return c;
 }
-#endif
+#endif // defined(GGML_USE_HIPBLAS)

 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -3472,6 +3482,12 @@ static __device__ __forceinline__ void mul_mat_q(
     }
 }

+#define MMQ_X_Q4_0_RDNA2 64
+#define MMQ_Y_Q4_0_RDNA2 128
+#define NWARPS_Q4_0_RDNA2 8
+#define MMQ_X_Q4_0_RDNA1 64
+#define MMQ_Y_Q4_0_RDNA1 64
+#define NWARPS_Q4_0_RDNA1 8
 #define MMQ_X_Q4_0_AMPERE 64
 #define MMQ_Y_Q4_0_AMPERE 128
 #define NWARPS_Q4_0_AMPERE 4
@@ -3479,11 +3495,32 @@ static __device__ __forceinline__ void mul_mat_q(
 #define MMQ_Y_Q4_0_PASCAL 64
 #define NWARPS_Q4_0_PASCAL 8

-template <bool need_check> static __global__ void mul_mat_q4_0(
+template <bool need_check> static __global__ void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q4_0_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    mul_mat_q4_0(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

-#if __CUDA_ARCH__ >= CC_TURING
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x = MMQ_X_Q4_0_RDNA2;
+    const int mmq_y = MMQ_Y_Q4_0_RDNA2;
+    const int nwarps = NWARPS_Q4_0_RDNA2;
+#else
+    const int mmq_x = MMQ_X_Q4_0_RDNA1;
+    const int mmq_y = MMQ_Y_Q4_0_RDNA1;
+    const int nwarps = NWARPS_Q4_0_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
+        load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= CC_TURING
     const int mmq_x = MMQ_X_Q4_0_AMPERE;
     const int mmq_y = MMQ_Y_Q4_0_AMPERE;
     const int nwarps = NWARPS_Q4_0_AMPERE;
@@ -3506,6 +3543,12 @@ template <bool need_check> static __global__ void mul_mat_q4_0(
 #endif // __CUDA_ARCH__ >= CC_TURING
 }

+#define MMQ_X_Q4_1_RDNA2 64
+#define MMQ_Y_Q4_1_RDNA2 128
+#define NWARPS_Q4_1_RDNA2 8
+#define MMQ_X_Q4_1_RDNA1 64
+#define MMQ_Y_Q4_1_RDNA1 64
+#define NWARPS_Q4_1_RDNA1 8
 #define MMQ_X_Q4_1_AMPERE 64
 #define MMQ_Y_Q4_1_AMPERE 128
 #define NWARPS_Q4_1_AMPERE 4
@@ -3514,14 +3557,33 @@ template <bool need_check> static __global__ void mul_mat_q4_0(
 #define NWARPS_Q4_1_PASCAL 8

 template <bool need_check> static __global__ void
-#if __CUDA_ARCH__ < CC_TURING
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#elif __CUDA_ARCH__ < CC_TURING
     __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2)
 #endif // __CUDA_ARCH__ < CC_TURING
     mul_mat_q4_1(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

-#if __CUDA_ARCH__ >= CC_TURING
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x = MMQ_X_Q4_1_RDNA2;
+    const int mmq_y = MMQ_Y_Q4_1_RDNA2;
+    const int nwarps = NWARPS_Q4_1_RDNA2;
+#else
+    const int mmq_x = MMQ_X_Q4_1_RDNA1;
+    const int mmq_y = MMQ_Y_Q4_1_RDNA1;
+    const int nwarps = NWARPS_Q4_1_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
+        load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= CC_TURING
     const int mmq_x = MMQ_X_Q4_1_AMPERE;
     const int mmq_y = MMQ_Y_Q4_1_AMPERE;
     const int nwarps = NWARPS_Q4_1_AMPERE;
@@ -3544,6 +3606,12 @@ template <bool need_check> static __global__ void
 #endif // __CUDA_ARCH__ >= CC_TURING
 }

+#define MMQ_X_Q5_0_RDNA2 64
+#define MMQ_Y_Q5_0_RDNA2 128
+#define NWARPS_Q5_0_RDNA2 8
+#define MMQ_X_Q5_0_RDNA1 64
+#define MMQ_Y_Q5_0_RDNA1 64
+#define NWARPS_Q5_0_RDNA1 8
 #define MMQ_X_Q5_0_AMPERE 128
 #define MMQ_Y_Q5_0_AMPERE 64
 #define NWARPS_Q5_0_AMPERE 4
@@ -3551,11 +3619,32 @@ template <bool need_check> static __global__ void
 #define MMQ_Y_Q5_0_PASCAL 64
 #define NWARPS_Q5_0_PASCAL 8

-template <bool need_check> static __global__ void mul_mat_q5_0(
+template <bool need_check> static __global__ void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q5_0_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    mul_mat_q5_0(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

-#if __CUDA_ARCH__ >= CC_TURING
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x = MMQ_X_Q5_0_RDNA2;
+    const int mmq_y = MMQ_Y_Q5_0_RDNA2;
+    const int nwarps = NWARPS_Q5_0_RDNA2;
+#else
+    const int mmq_x = MMQ_X_Q5_0_RDNA1;
+    const int mmq_y = MMQ_Y_Q5_0_RDNA1;
+    const int nwarps = NWARPS_Q5_0_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
+        load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= CC_TURING
     const int mmq_x = MMQ_X_Q5_0_AMPERE;
     const int mmq_y = MMQ_Y_Q5_0_AMPERE;
     const int nwarps = NWARPS_Q5_0_AMPERE;
@@ -3578,6 +3667,12 @@ template <bool need_check> static __global__ void mul_mat_q5_0(
 #endif // __CUDA_ARCH__ >= CC_TURING
 }

+#define MMQ_X_Q5_1_RDNA2 64
+#define MMQ_Y_Q5_1_RDNA2 128
+#define NWARPS_Q5_1_RDNA2 8
+#define MMQ_X_Q5_1_RDNA1 64
+#define MMQ_Y_Q5_1_RDNA1 64
+#define NWARPS_Q5_1_RDNA1 8
 #define MMQ_X_Q5_1_AMPERE 128
 #define MMQ_Y_Q5_1_AMPERE 64
 #define NWARPS_Q5_1_AMPERE 4
@@ -3585,11 +3680,32 @@ template <bool need_check> static __global__ void mul_mat_q5_0(
 #define MMQ_Y_Q5_1_PASCAL 64
 #define NWARPS_Q5_1_PASCAL 8

-template <bool need_check> static __global__ void mul_mat_q5_1(
+template <bool need_check> static __global__ void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q5_1_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    mul_mat_q5_1(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

-#if __CUDA_ARCH__ >= CC_TURING
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x = MMQ_X_Q5_1_RDNA2;
+    const int mmq_y = MMQ_Y_Q5_1_RDNA2;
+    const int nwarps = NWARPS_Q5_1_RDNA2;
+#else
+    const int mmq_x = MMQ_X_Q5_1_RDNA1;
+    const int mmq_y = MMQ_Y_Q5_1_RDNA1;
+    const int nwarps = NWARPS_Q5_1_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
+        load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= CC_TURING
     const int mmq_x = MMQ_X_Q5_1_AMPERE;
     const int mmq_y = MMQ_Y_Q5_1_AMPERE;
     const int nwarps = NWARPS_Q5_1_AMPERE;
@@ -3612,6 +3728,12 @@ template <bool need_check> static __global__ void mul_mat_q5_1(
 #endif // __CUDA_ARCH__ >= CC_TURING
 }

+#define MMQ_X_Q8_0_RDNA2 64
+#define MMQ_Y_Q8_0_RDNA2 128
+#define NWARPS_Q8_0_RDNA2 8
+#define MMQ_X_Q8_0_RDNA1 64
+#define MMQ_Y_Q8_0_RDNA1 64
+#define NWARPS_Q8_0_RDNA1 8
 #define MMQ_X_Q8_0_AMPERE 128
 #define MMQ_Y_Q8_0_AMPERE 64
 #define NWARPS_Q8_0_AMPERE 4
@@ -3619,11 +3741,32 @@ template <bool need_check> static __global__ void mul_mat_q5_1(
 #define MMQ_Y_Q8_0_PASCAL 64
 #define NWARPS_Q8_0_PASCAL 8

-template <bool need_check> static __global__ void mul_mat_q8_0(
+template <bool need_check> static __global__ void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q8_0_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    mul_mat_q8_0(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

-#if __CUDA_ARCH__ >= CC_TURING
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x = MMQ_X_Q8_0_RDNA2;
+    const int mmq_y = MMQ_Y_Q8_0_RDNA2;
+    const int nwarps = NWARPS_Q8_0_RDNA2;
+#else
+    const int mmq_x = MMQ_X_Q8_0_RDNA1;
+    const int mmq_y = MMQ_Y_Q8_0_RDNA1;
+    const int nwarps = NWARPS_Q8_0_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
+        load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= CC_TURING
     const int mmq_x = MMQ_X_Q8_0_AMPERE;
     const int mmq_y = MMQ_Y_Q8_0_AMPERE;
     const int nwarps = NWARPS_Q8_0_AMPERE;
@@ -3646,6 +3789,12 @@ template <bool need_check> static __global__ void mul_mat_q8_0(
 #endif // __CUDA_ARCH__ >= CC_TURING
 }

+#define MMQ_X_Q2_K_RDNA2 64
+#define MMQ_Y_Q2_K_RDNA2 128
+#define NWARPS_Q2_K_RDNA2 8
+#define MMQ_X_Q2_K_RDNA1 128
+#define MMQ_Y_Q2_K_RDNA1 32
+#define NWARPS_Q2_K_RDNA1 8
 #define MMQ_X_Q2_K_AMPERE 64
 #define MMQ_Y_Q2_K_AMPERE 128
 #define NWARPS_Q2_K_AMPERE 4
@@ -3653,11 +3802,32 @@ template <bool need_check> static __global__ void mul_mat_q8_0(
 #define MMQ_Y_Q2_K_PASCAL 64
 #define NWARPS_Q2_K_PASCAL 8

-template <bool need_check> static __global__ void mul_mat_q2_K(
+template <bool need_check> static __global__ void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q2_K_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    mul_mat_q2_K(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

-#if __CUDA_ARCH__ >= CC_TURING
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x = MMQ_X_Q2_K_RDNA2;
+    const int mmq_y = MMQ_Y_Q2_K_RDNA2;
+    const int nwarps = NWARPS_Q2_K_RDNA2;
+#else
+    const int mmq_x = MMQ_X_Q2_K_RDNA1;
+    const int mmq_y = MMQ_Y_Q2_K_RDNA1;
+    const int nwarps = NWARPS_Q2_K_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
+        load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= CC_TURING
    const int mmq_x = MMQ_X_Q2_K_AMPERE;
    const int mmq_y = MMQ_Y_Q2_K_AMPERE;
    const int nwarps = NWARPS_Q2_K_AMPERE;
@@ -3680,6 +3850,12 @@ template <bool need_check> static __global__ void mul_mat_q2_K(
 #endif // __CUDA_ARCH__ >= CC_TURING
 }

+#define MMQ_X_Q3_K_RDNA2 128
+#define MMQ_Y_Q3_K_RDNA2 64
+#define NWARPS_Q3_K_RDNA2 8
+#define MMQ_X_Q3_K_RDNA1 32
+#define MMQ_Y_Q3_K_RDNA1 128
+#define NWARPS_Q3_K_RDNA1 8
 #define MMQ_X_Q3_K_AMPERE 128
 #define MMQ_Y_Q3_K_AMPERE 128
 #define NWARPS_Q3_K_AMPERE 4
@@ -3688,14 +3864,33 @@ template <bool need_check> static __global__ void mul_mat_q2_K(
 #define NWARPS_Q3_K_PASCAL 8

 template <bool need_check> static __global__ void
-#if __CUDA_ARCH__ < CC_TURING
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#elif __CUDA_ARCH__ < CC_TURING
     __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2)
 #endif // __CUDA_ARCH__ < CC_TURING
     mul_mat_q3_K(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

-#if __CUDA_ARCH__ >= CC_TURING
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x = MMQ_X_Q3_K_RDNA2;
+    const int mmq_y = MMQ_Y_Q3_K_RDNA2;
+    const int nwarps = NWARPS_Q3_K_RDNA2;
+#else
+    const int mmq_x = MMQ_X_Q3_K_RDNA1;
+    const int mmq_y = MMQ_Y_Q3_K_RDNA1;
+    const int nwarps = NWARPS_Q3_K_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
+        load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= CC_TURING
     const int mmq_x = MMQ_X_Q3_K_AMPERE;
     const int mmq_y = MMQ_Y_Q3_K_AMPERE;
     const int nwarps = NWARPS_Q3_K_AMPERE;
@@ -3718,6 +3913,12 @@ template <bool need_check> static __global__ void
 #endif // __CUDA_ARCH__ >= CC_TURING
 }

+#define MMQ_X_Q4_K_RDNA2 64
+#define MMQ_Y_Q4_K_RDNA2 128
+#define NWARPS_Q4_K_RDNA2 8
+#define MMQ_X_Q4_K_RDNA1 32
+#define MMQ_Y_Q4_K_RDNA1 64
+#define NWARPS_Q4_K_RDNA1 8
 #define MMQ_X_Q4_K_AMPERE 64
 #define MMQ_Y_Q4_K_AMPERE 128
 #define NWARPS_Q4_K_AMPERE 4
@@ -3726,14 +3927,33 @@ template <bool need_check> static __global__ void
 #define NWARPS_Q4_K_PASCAL 8

 template <bool need_check> static __global__ void
-#if __CUDA_ARCH__ < CC_TURING
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#elif __CUDA_ARCH__ < CC_TURING
     __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2)
 #endif // __CUDA_ARCH__ < CC_TURING
     mul_mat_q4_K(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

-#if __CUDA_ARCH__ >= CC_TURING
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x = MMQ_X_Q4_K_RDNA2;
+    const int mmq_y = MMQ_Y_Q4_K_RDNA2;
+    const int nwarps = NWARPS_Q4_K_RDNA2;
+#else
+    const int mmq_x = MMQ_X_Q4_K_RDNA1;
+    const int mmq_y = MMQ_Y_Q4_K_RDNA1;
+    const int nwarps = NWARPS_Q4_K_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
+        load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= CC_TURING
     const int mmq_x = MMQ_X_Q4_K_AMPERE;
     const int mmq_y = MMQ_Y_Q4_K_AMPERE;
     const int nwarps = NWARPS_Q4_K_AMPERE;
@@ -3756,6 +3976,12 @@ template <bool need_check> static __global__ void
 #endif // __CUDA_ARCH__ >= CC_TURING
 }

+#define MMQ_X_Q5_K_RDNA2 64
+#define MMQ_Y_Q5_K_RDNA2 128
+#define NWARPS_Q5_K_RDNA2 8
+#define MMQ_X_Q5_K_RDNA1 32
+#define MMQ_Y_Q5_K_RDNA1 64
+#define NWARPS_Q5_K_RDNA1 8
 #define MMQ_X_Q5_K_AMPERE 64
 #define MMQ_Y_Q5_K_AMPERE 128
 #define NWARPS_Q5_K_AMPERE 4
@@ -3763,11 +3989,32 @@ template <bool need_check> static __global__ void
 #define MMQ_Y_Q5_K_PASCAL 64
 #define NWARPS_Q5_K_PASCAL 8

-template <bool need_check> static __global__ void mul_mat_q5_K(
+template <bool need_check> static __global__ void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q5_K_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    mul_mat_q5_K(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

-#if __CUDA_ARCH__ >= CC_TURING
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x = MMQ_X_Q5_K_RDNA2;
+    const int mmq_y = MMQ_Y_Q5_K_RDNA2;
+    const int nwarps = NWARPS_Q5_K_RDNA2;
+#else
+    const int mmq_x = MMQ_X_Q5_K_RDNA1;
+    const int mmq_y = MMQ_Y_Q5_K_RDNA1;
+    const int nwarps = NWARPS_Q5_K_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
+        load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= CC_TURING
     const int mmq_x = MMQ_X_Q5_K_AMPERE;
     const int mmq_y = MMQ_Y_Q5_K_AMPERE;
     const int nwarps = NWARPS_Q5_K_AMPERE;
@@ -3790,6 +4037,12 @@ template <bool need_check> static __global__ void mul_mat_q5_K(
 #endif // __CUDA_ARCH__ >= CC_TURING
 }

+#define MMQ_X_Q6_K_RDNA2 64
+#define MMQ_Y_Q6_K_RDNA2 128
+#define NWARPS_Q6_K_RDNA2 8
+#define MMQ_X_Q6_K_RDNA1 32
+#define MMQ_Y_Q6_K_RDNA1 64
+#define NWARPS_Q6_K_RDNA1 8
 #define MMQ_X_Q6_K_AMPERE 64
 #define MMQ_Y_Q6_K_AMPERE 64
 #define NWARPS_Q6_K_AMPERE 4
@@ -3798,14 +4051,33 @@ template <bool need_check> static __global__ void mul_mat_q5_K(
 #define NWARPS_Q6_K_PASCAL 8

 template <bool need_check> static __global__ void
-#if __CUDA_ARCH__ < CC_TURING
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#elif __CUDA_ARCH__ < CC_TURING
     __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2)
 #endif // __CUDA_ARCH__ < CC_TURING
     mul_mat_q6_K(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

-#if __CUDA_ARCH__ >= CC_TURING
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x = MMQ_X_Q6_K_RDNA2;
+    const int mmq_y = MMQ_Y_Q6_K_RDNA2;
+    const int nwarps = NWARPS_Q6_K_RDNA2;
+#else
+    const int mmq_x = MMQ_X_Q6_K_RDNA1;
+    const int mmq_y = MMQ_Y_Q6_K_RDNA1;
+    const int nwarps = NWARPS_Q6_K_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
+        load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= CC_TURING
     const int mmq_x = MMQ_X_Q6_K_AMPERE;
     const int mmq_y = MMQ_Y_Q6_K_AMPERE;
     const int nwarps = NWARPS_Q6_K_AMPERE;
@@ -4588,7 +4860,15 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(
     const int compute_capability = g_compute_capabilities[id];

     int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_TURING) {
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x = MMQ_X_Q4_0_RDNA2;
+        mmq_y = MMQ_Y_Q4_0_RDNA2;
+        nwarps = NWARPS_Q4_0_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x = MMQ_X_Q4_0_RDNA1;
+        mmq_y = MMQ_Y_Q4_0_RDNA1;
+        nwarps = NWARPS_Q4_0_RDNA1;
+    } else if (compute_capability >= CC_TURING) {
         mmq_x = MMQ_X_Q4_0_AMPERE;
         mmq_y = MMQ_Y_Q4_0_AMPERE;
         nwarps = NWARPS_Q4_0_AMPERE;
@@ -4625,7 +4905,15 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(
     const int compute_capability = g_compute_capabilities[id];

    int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_TURING) {
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x = MMQ_X_Q4_1_RDNA2;
+        mmq_y = MMQ_Y_Q4_1_RDNA2;
+        nwarps = NWARPS_Q4_1_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x = MMQ_X_Q4_1_RDNA1;
+        mmq_y = MMQ_Y_Q4_1_RDNA1;
+        nwarps = NWARPS_Q4_1_RDNA1;
+    } else if (compute_capability >= CC_TURING) {
         mmq_x = MMQ_X_Q4_1_AMPERE;
         mmq_y = MMQ_Y_Q4_1_AMPERE;
         nwarps = NWARPS_Q4_1_AMPERE;
@@ -4662,7 +4950,15 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(
     const int compute_capability = g_compute_capabilities[id];

     int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_TURING) {
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x = MMQ_X_Q5_0_RDNA2;
+        mmq_y = MMQ_Y_Q5_0_RDNA2;
+        nwarps = NWARPS_Q5_0_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x = MMQ_X_Q5_0_RDNA1;
+        mmq_y = MMQ_Y_Q5_0_RDNA1;
+        nwarps = NWARPS_Q5_0_RDNA1;
+    } else if (compute_capability >= CC_TURING) {
         mmq_x = MMQ_X_Q5_0_AMPERE;
         mmq_y = MMQ_Y_Q5_0_AMPERE;
         nwarps = NWARPS_Q5_0_AMPERE;
@@ -4699,7 +4995,15 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(
     const int compute_capability = g_compute_capabilities[id];

     int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_TURING) {
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x = MMQ_X_Q5_1_RDNA2;
+        mmq_y = MMQ_Y_Q5_1_RDNA2;
+        nwarps = NWARPS_Q5_1_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x = MMQ_X_Q5_1_RDNA1;
+        mmq_y = MMQ_Y_Q5_1_RDNA1;
+        nwarps = NWARPS_Q5_1_RDNA1;
+    } else if (compute_capability >= CC_TURING) {
         mmq_x = MMQ_X_Q5_1_AMPERE;
         mmq_y = MMQ_Y_Q5_1_AMPERE;
         nwarps = NWARPS_Q5_1_AMPERE;
@@ -4736,7 +5040,15 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(
     const int compute_capability = g_compute_capabilities[id];

     int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_TURING) {
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x = MMQ_X_Q8_0_RDNA2;
+        mmq_y = MMQ_Y_Q8_0_RDNA2;
+        nwarps = NWARPS_Q8_0_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x = MMQ_X_Q8_0_RDNA1;
+        mmq_y = MMQ_Y_Q8_0_RDNA1;
+        nwarps = NWARPS_Q8_0_RDNA1;
+    } else if (compute_capability >= CC_TURING) {
         mmq_x = MMQ_X_Q8_0_AMPERE;
         mmq_y = MMQ_Y_Q8_0_AMPERE;
         nwarps = NWARPS_Q8_0_AMPERE;
@@ -4773,7 +5085,15 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(
     const int compute_capability = g_compute_capabilities[id];

     int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_TURING) {
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x = MMQ_X_Q2_K_RDNA2;
+        mmq_y = MMQ_Y_Q2_K_RDNA2;
+        nwarps = NWARPS_Q2_K_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x = MMQ_X_Q2_K_RDNA1;
+        mmq_y = MMQ_Y_Q2_K_RDNA1;
+        nwarps = NWARPS_Q2_K_RDNA1;
+    } else if (compute_capability >= CC_TURING) {
         mmq_x = MMQ_X_Q2_K_AMPERE;
         mmq_y = MMQ_Y_Q2_K_AMPERE;
         nwarps = NWARPS_Q2_K_AMPERE;
@@ -4812,7 +5132,15 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
     const int compute_capability = g_compute_capabilities[id];

     int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_TURING) {
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x = MMQ_X_Q3_K_RDNA2;
+        mmq_y = MMQ_Y_Q3_K_RDNA2;
+        nwarps = NWARPS_Q3_K_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x = MMQ_X_Q3_K_RDNA1;
+        mmq_y = MMQ_Y_Q3_K_RDNA1;
+        nwarps = NWARPS_Q3_K_RDNA1;
+    } else if (compute_capability >= CC_TURING) {
         mmq_x = MMQ_X_Q3_K_AMPERE;
         mmq_y = MMQ_Y_Q3_K_AMPERE;
         nwarps = NWARPS_Q3_K_AMPERE;
@@ -4850,7 +5178,15 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(
     const int compute_capability = g_compute_capabilities[id];

     int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_TURING) {
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x = MMQ_X_Q4_K_RDNA2;
+        mmq_y = MMQ_Y_Q4_K_RDNA2;
+        nwarps = NWARPS_Q4_K_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x = MMQ_X_Q4_K_RDNA1;
+        mmq_y = MMQ_Y_Q4_K_RDNA1;
+        nwarps = NWARPS_Q4_K_RDNA1;
+    } else if (compute_capability >= CC_TURING) {
         mmq_x = MMQ_X_Q4_K_AMPERE;
         mmq_y = MMQ_Y_Q4_K_AMPERE;
         nwarps = NWARPS_Q4_K_AMPERE;
@@ -4887,7 +5223,15 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(
     const int compute_capability = g_compute_capabilities[id];

     int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_TURING) {
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x = MMQ_X_Q5_K_RDNA2;
+        mmq_y = MMQ_Y_Q5_K_RDNA2;
+        nwarps = NWARPS_Q5_K_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x = MMQ_X_Q5_K_RDNA1;
+        mmq_y = MMQ_Y_Q5_K_RDNA1;
+        nwarps = NWARPS_Q5_K_RDNA1;
+    } else if (compute_capability >= CC_TURING) {
         mmq_x = MMQ_X_Q5_K_AMPERE;
         mmq_y = MMQ_Y_Q5_K_AMPERE;
         nwarps = NWARPS_Q5_K_AMPERE;
@@ -4924,7 +5268,15 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(
     const int compute_capability = g_compute_capabilities[id];

     int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_TURING) {
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x = MMQ_X_Q6_K_RDNA2;
+        mmq_y = MMQ_Y_Q6_K_RDNA2;
+        nwarps = NWARPS_Q6_K_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x = MMQ_X_Q6_K_RDNA1;
+        mmq_y = MMQ_Y_Q6_K_RDNA1;
+        nwarps = NWARPS_Q6_K_RDNA1;
+    } else if (compute_capability >= CC_TURING) {
         mmq_x = MMQ_X_Q6_K_AMPERE;
         mmq_y = MMQ_Y_Q6_K_AMPERE;
         nwarps = NWARPS_Q6_K_AMPERE;
@@ -5165,8 +5517,11 @@ void ggml_init_cublas() {

         g_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+        g_compute_capabilities[id] = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
+#else
         g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
     }
     for (int64_t id = 0; id < g_device_count; ++id) {
         g_tensor_split[id] /= total_vram;
@@ -5247,7 +5602,8 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
     if (src->backend == GGML_BACKEND_CPU) {
         kind = cudaMemcpyHostToDevice;
         src_ptr = (char *) src->data;
-    } else if (src->backend == GGML_BACKEND_GPU) {
+    } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) {
+        GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
         kind = cudaMemcpyDeviceToDevice;
         struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
         int id;
@@ -5289,9 +5645,7 @@ inline void ggml_cuda_op_add(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {

-    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);

     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
@@ -5452,14 +5806,41 @@ inline void ggml_cuda_op_mul_mat_q(
 }

 static int64_t get_row_rounding(ggml_type type) {
-    int max_compute_capability = INT_MIN;
-    for (int id = 0; id < g_device_count; ++id) {
-        if (max_compute_capability < g_compute_capabilities[id]
-                && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
-            max_compute_capability = g_compute_capabilities[id];
+    int64_t min_compute_capability = INT_MAX;
+    int64_t max_compute_capability = INT_MIN;
+    for (int64_t id = 0; id < g_device_count; ++id) {
+        if (g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
+            if (min_compute_capability > g_compute_capabilities[id]) {
+                min_compute_capability = g_compute_capabilities[id];
+            }
+            if (max_compute_capability < g_compute_capabilities[id]) {
+                max_compute_capability = g_compute_capabilities[id];
+            }
         }
     }

+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    switch(type) {
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+            return max_compute_capability >= CC_RDNA2 ? 128 : 64;
+        case GGML_TYPE_F16:
+            return 1;
+        case GGML_TYPE_Q2_K:
+            return max_compute_capability >= CC_RDNA2 ? 128 : 32;
+        case GGML_TYPE_Q3_K:
+            return min_compute_capability < CC_RDNA2 ? 128 : 64;
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
+            return max_compute_capability >= CC_RDNA2 ? 128 : 64;
+        default:
+            GGML_ASSERT(false);
+    }
+#else
     switch(type) {
         case GGML_TYPE_Q4_0:
         case GGML_TYPE_Q4_1:

@@ -5480,6 +5861,7 @@ static int64_t get_row_rounding(ggml_type type) {
         default:
             GGML_ASSERT(false);
     }
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 }

 inline void ggml_cuda_op_mul_mat_vec_q(
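
Aside (hypothetical helper, not from the patch): the value returned by get_row_rounding() is used to align each device's row slice so it covers whole quantization blocks and mul_mat_q tiles; the rounding itself is a plain round-up to a multiple:

    #include <cstdint>

    static int64_t round_up_rows(int64_t rows, int64_t rounding) {
        return (rows + rounding - 1) / rounding * rounding;  // round up to a multiple of `rounding`
    }
    // e.g. round_up_rows(100, 64) == 128, round_up_rows(130, 64) == 192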
@@ -5631,10 +6013,15 @@ inline void ggml_cuda_op_mul_mat_cublas(
     const int64_t ne0 = dst->ne[0];
     const int64_t row_diff = row_high - row_low;

-    const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
-    size_t src0_as;
-    float * src0_ddf_i = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as);
-    to_fp32_cuda(src0_dd_i, src0_ddf_i, row_diff*ne00, stream);
+    float * src0_ddq_as_f32;
+    size_t src0_as = 0;
+
+    if (src0->type != GGML_TYPE_F32) {
+        const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
+        src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as); // NOLINT
+        to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream);
+    }
+    const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32;

     int id;
     CUDA_CHECK(cudaGetDevice(&id));

@@ -5651,10 +6038,11 @@ inline void ggml_cuda_op_mul_mat_cublas(
                 src1_ddf_i, ne10,
             &beta, dst_dd_i, ldc));

-    ggml_cuda_pool_free(src0_ddf_i, src0_as);
+    if (src0_as > 0) {
+        ggml_cuda_pool_free(src0_ddq_as_f32, src0_as);
+    }

     (void) dst;
-    (void) src0_dd_i;
     (void) src1_ddq_i;
     (void) src1_padded_row_size;
 }

@@ -5793,7 +6181,6 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
     const bool use_src1 = src1 != nullptr;
     const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;

-    GGML_ASSERT( src0->backend != GGML_BACKEND_GPU_SPLIT);
     GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
     GGML_ASSERT( dst->backend != GGML_BACKEND_GPU_SPLIT);

@@ -5801,7 +6188,7 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
     struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
     struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;

-    const bool src0_on_device = src0->backend == GGML_BACKEND_GPU;
+    const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
     const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
     const bool dst_on_device = dst->backend == GGML_BACKEND_GPU;
33  ggml-metal.m

@@ -69,6 +69,7 @@ struct ggml_metal_context {
     GGML_METAL_DECL_KERNEL(soft_max_4);
     GGML_METAL_DECL_KERNEL(diag_mask_inf);
     GGML_METAL_DECL_KERNEL(diag_mask_inf_8);
+    GGML_METAL_DECL_KERNEL(get_rows_f32);
     GGML_METAL_DECL_KERNEL(get_rows_f16);
     GGML_METAL_DECL_KERNEL(get_rows_q4_0);
     GGML_METAL_DECL_KERNEL(get_rows_q4_1);

@@ -177,7 +178,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
    ctx->n_buffers = 0;
    ctx->concur_list_len = 0;

-    ctx->d_queue = dispatch_queue_create("llama.cpp", DISPATCH_QUEUE_CONCURRENT);
+    ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);

#ifdef GGML_SWIFT
    // load the default.metallib file

@@ -256,6 +257,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
        GGML_METAL_ADD_KERNEL(soft_max_4);
        GGML_METAL_ADD_KERNEL(diag_mask_inf);
        GGML_METAL_ADD_KERNEL(diag_mask_inf_8);
+        GGML_METAL_ADD_KERNEL(get_rows_f32);
        GGML_METAL_ADD_KERNEL(get_rows_f16);
        GGML_METAL_ADD_KERNEL(get_rows_q4_0);
        GGML_METAL_ADD_KERNEL(get_rows_q4_1);

@@ -325,7 +327,9 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
    GGML_METAL_DEL_KERNEL(gelu);
    GGML_METAL_DEL_KERNEL(soft_max);
    GGML_METAL_DEL_KERNEL(soft_max_4);
+    GGML_METAL_DEL_KERNEL(diag_mask_inf);
    GGML_METAL_DEL_KERNEL(diag_mask_inf_8);
+    GGML_METAL_DEL_KERNEL(get_rows_f32);
    GGML_METAL_DEL_KERNEL(get_rows_f16);
    GGML_METAL_DEL_KERNEL(get_rows_q4_0);
    GGML_METAL_DEL_KERNEL(get_rows_q4_1);

@@ -418,6 +422,7 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, stru
    for (int i = 0; i < ctx->n_buffers; ++i) {
        const int64_t ioffs = (int64_t) t->data - (int64_t) ctx->buffers[i].data;

+        //metal_printf("ioffs = %10ld, tsize = %10ld, sum = %10ld, ctx->buffers[%d].size = %10ld, name = %s\n", ioffs, tsize, ioffs + tsize, i, ctx->buffers[i].size, ctx->buffers[i].name);
        if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) {
            *offs = (size_t) ioffs;

@@ -755,6 +760,7 @@ void ggml_metal_graph_compute(
                case GGML_OP_ADD:
                    {
                        GGML_ASSERT(ggml_is_contiguous(src0));
+                        GGML_ASSERT(ggml_is_contiguous(src1));

                        // utilize float4
                        GGML_ASSERT(ne00 % 4 == 0);

@@ -762,6 +768,7 @@ void ggml_metal_graph_compute(

                        if (ggml_nelements(src1) == ne10) {
                            // src1 is a row
+                            GGML_ASSERT(ne11 == 1);
                            [encoder setComputePipelineState:ctx->pipeline_add_row];
                        } else {
                            [encoder setComputePipelineState:ctx->pipeline_add];

@@ -778,6 +785,7 @@ void ggml_metal_graph_compute(
                case GGML_OP_MUL:
                    {
                        GGML_ASSERT(ggml_is_contiguous(src0));
+                        GGML_ASSERT(ggml_is_contiguous(src1));

                        // utilize float4
                        GGML_ASSERT(ne00 % 4 == 0);

@@ -785,6 +793,7 @@ void ggml_metal_graph_compute(

                        if (ggml_nelements(src1) == ne10) {
                            // src1 is a row
+                            GGML_ASSERT(ne11 == 1);
                            [encoder setComputePipelineState:ctx->pipeline_mul_row];
                        } else {
                            [encoder setComputePipelineState:ctx->pipeline_mul];

@@ -800,6 +809,8 @@ void ggml_metal_graph_compute(
                    } break;
                case GGML_OP_SCALE:
                    {
+                        GGML_ASSERT(ggml_is_contiguous(src0));
+
                        const float scale = *(const float *) src1->data;

                        [encoder setComputePipelineState:ctx->pipeline_scale];

@@ -899,8 +910,8 @@ void ggml_metal_graph_compute(

                        // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
                        // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
-                        if (ggml_is_contiguous(src0) &&
-                            ggml_is_contiguous(src1) &&
+                        if (!ggml_is_transposed(src0) &&
+                            !ggml_is_transposed(src1) &&
                            src1t == GGML_TYPE_F32 &&
                            [ctx->device supportsFamily:MTLGPUFamilyApple7] &&
                            ne00%32 == 0 &&

@@ -925,9 +936,12 @@ void ggml_metal_graph_compute(
                            [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:5];
                            [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:6];
                            [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:7];
-                            [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:8];
-                            [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:9];
-                            [encoder setBytes:&gqa  length:sizeof(gqa)  atIndex:10];
+                            [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:8];
+                            [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:9];
+                            [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:10];
+                            [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:11];
+                            [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:12];
+                            [encoder setBytes:&gqa  length:sizeof(gqa)  atIndex:13];
                            [encoder setThreadgroupMemoryLength:8192 atIndex:0];
                            [encoder dispatchThreadgroups:MTLSizeMake( (ne11+31)/32, (ne01+63) / 64, ne12) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
                        } else {

@@ -1077,6 +1091,7 @@ void ggml_metal_graph_compute(
                case GGML_OP_GET_ROWS:
                    {
                        switch (src0->type) {
+                            case GGML_TYPE_F32:  [encoder setComputePipelineState:ctx->pipeline_get_rows_f32];  break;
                            case GGML_TYPE_F16:  [encoder setComputePipelineState:ctx->pipeline_get_rows_f16];  break;
                            case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break;
                            case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_1]; break;

@@ -1092,9 +1107,9 @@ void ggml_metal_graph_compute(
                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                        [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
                        [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
-                        [encoder setBytes:&(src0->ne[0]) length:sizeof( int64_t) atIndex:3];
-                        [encoder setBytes:&(src0->nb[1]) length:sizeof(uint64_t) atIndex:4];
-                        [encoder setBytes:&(dst->nb[1])  length:sizeof(uint64_t) atIndex:5];
+                        [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3];
+                        [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:4];
+                        [encoder setBytes:&nb1  length:sizeof(uint64_t) atIndex:5];

                        const int64_t n = ggml_nelements(src1);
ggml-metal.metal

@@ -38,7 +38,7 @@ kernel void kernel_add_row(
        device const float4 * src0,
        device const float4 * src1,
        device       float4 * dst,
        constant    int64_t & nb,
        uint tpig[[thread_position_in_grid]]) {
    dst[tpig] = src0[tpig] + src1[tpig % nb];
}

@@ -1321,7 +1321,6 @@ kernel void kernel_mul_mat_q3_K_f32(
            dst[r1*ne0 + r2*ne0*ne1 + first_row + row] = sumf1[row];
        }
    }
-
}
#else
kernel void kernel_mul_mat_q3_K_f32(

@@ -1865,6 +1864,15 @@ kernel void kernel_mul_mat_q6_K_f32(

//============================= templates and their specializations =============================

+// NOTE: this is not dequantizing - we are simply fitting the template
+template <typename type4x4>
+void dequantize_f32(device const float4x4 * src, short il, thread type4x4 & reg) {
+    float4x4 temp = *(((device float4x4 *)src));
+    for (int i = 0; i < 16; i++){
+        reg[i/4][i%4] = temp[i/4][i%4];
+    }
+}
+
template <typename type4x4>
void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg) {
    half4x4 temp = *(((device half4x4 *)src));

@@ -1875,7 +1883,6 @@ void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg)

template <typename type4x4>
void dequantize_q4_0(device const block_q4_0 *xb, short il, thread type4x4 & reg) {
-
    device const uint16_t * qs = ((device const uint16_t *)xb + 1);
    const float d1 = il ? (xb->d / 16.h) : xb->d;
    const float d2 = d1 / 256.f;

@@ -1887,12 +1894,10 @@ void dequantize_q4_0(device const block_q4_0 *xb, short il, thread type4x4 & reg
        reg[i/2][2*(i%2)+0] = d1 * (qs[i] & mask0) + md;
        reg[i/2][2*(i%2)+1] = d2 * (qs[i] & mask1) + md;
    }
-
}

template <typename type4x4>
void dequantize_q4_1(device const block_q4_1 *xb, short il, thread type4x4 & reg) {
-
    device const uint16_t * qs = ((device const uint16_t *)xb + 2);
    const float d1 = il ? (xb->d / 16.h) : xb->d;
    const float d2 = d1 / 256.f;

@@ -1964,7 +1969,6 @@ void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg
    for (int i = 0; i < 16; ++i) {
        reg[i/4][i%4] = dl * (q[i] & mask) - (h[i] & m ? 0 : ml);
    }
-
#else
    float kcoef = il&1 ? 1.f/16.f : 1.f;
    uint16_t kmask = il&1 ? 0xF0 : 0x0F;

@@ -2110,22 +2114,25 @@ kernel void kernel_get_rows(
// each block_q contains 16*nl weights
template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread half4x4 &)>
kernel void kernel_mul_mm(device const uchar * src0,
-                          device const float * src1,
+                          device const uchar * src1,
                          device       float * dst,
                          constant    int64_t & ne00,
                          constant    int64_t & ne02,
                          constant    int64_t & nb01,
                          constant    int64_t & nb02,
                          constant    int64_t & ne12,
+                          constant    int64_t & nb10,
+                          constant    int64_t & nb11,
+                          constant    int64_t & nb12,
                          constant    int64_t & ne0,
                          constant    int64_t & ne1,
                          constant    uint    & gqa,
                          threadgroup uchar  * shared_memory [[threadgroup(0)]],
                          uint3                 tgpig[[threadgroup_position_in_grid]],
                          uint                  tiitg[[thread_index_in_threadgroup]],
                          uint                  sgitg[[simdgroup_index_in_threadgroup]]) {

-    threadgroup half  * sa = ((threadgroup half  *)shared_memory);
+    threadgroup half  * sa = (threadgroup half  *)(shared_memory);
    threadgroup float * sb = (threadgroup float *)(shared_memory + 4096);

    const uint r0 = tgpig.y;

@@ -2138,7 +2145,7 @@ kernel void kernel_mul_mm(device const uchar * src0,
    short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1;
    short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1;

    simdgroup_half8x8  ma[4];
    simdgroup_float8x8 mb[2];
    simdgroup_float8x8 c_res[8];
    for (int i = 0; i < 8; i++){

@@ -2146,10 +2153,15 @@ kernel void kernel_mul_mm(device const uchar * src0,
    }

    short il = (tiitg % THREAD_PER_ROW);
-    uint offset0 = im/gqa*nb02; ushort offset1 = il/nl;
-    device const block_q * x = (device const block_q *)(src0 + (r0 * BLOCK_SIZE_M + thread_row) * nb01 + offset0) + offset1;
-    device const float * y = src1 + (r1 * BLOCK_SIZE_N + thread_col) * ne00 \
-                             + BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL) + im * ne00 * ne1;
+
+    uint   offset0 = im/gqa*nb02;
+    ushort offset1 = il/nl;
+
+    device const block_q * x = (device const block_q *)(src0 + (r0 * BLOCK_SIZE_M + thread_row) * nb01 + offset0) + offset1;
+    device const float   * y = (device const float   *)(src1
+        + nb12 * im
+        + nb11 * (r1 * BLOCK_SIZE_N + thread_col)
+        + nb10 * (BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL)));

    for (int loop_k = 0; loop_k < ne00; loop_k += BLOCK_SIZE_K) {
        //load data and store to threadgroup memory
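
Aside (illustrative C++, not Metal, and not part of the patch): the kernel now rebuilds the src1 element pointer from per-dimension byte strides (nb10/nb11/nb12) instead of assuming a densely packed float matrix, which is what lets non-contiguous src1 layouts address correctly. The same arithmetic in host-side form, with assumed parameter names mirroring the Metal code:

    #include <cstdint>

    // i10/i11/i12 are element indices in dims 0/1/2; nb10/nb11/nb12 are byte strides.
    static const float * src1_element(const uint8_t * src1,
                                      int64_t nb10, int64_t nb11, int64_t nb12,
                                      int64_t i10,  int64_t i11,  int64_t i12) {
        return (const float *)(src1 + nb12*i12 + nb11*i11 + nb10*i10);
    }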
@@ -2229,6 +2241,7 @@ kernel void kernel_mul_mm(device const uchar * src0,
typedef void (get_rows_t)(device const void *, device const int *, device float *, constant int64_t &, \
                          constant uint64_t &, constant uint64_t &, uint, uint, uint);

+template [[host_name("kernel_get_rows_f32")]]  kernel get_rows_t kernel_get_rows<float4x4,   1, dequantize_f32>;
template [[host_name("kernel_get_rows_f16")]]  kernel get_rows_t kernel_get_rows<half4x4,    1, dequantize_f16>;
template [[host_name("kernel_get_rows_q4_0")]] kernel get_rows_t kernel_get_rows<block_q4_0, 2, dequantize_q4_0>;
template [[host_name("kernel_get_rows_q4_1")]] kernel get_rows_t kernel_get_rows<block_q4_1, 2, dequantize_q4_1>;

@@ -2239,14 +2252,27 @@ template [[host_name("kernel_get_rows_q4_K")]] kernel get_rows_t kernel_get_rows
template [[host_name("kernel_get_rows_q5_K")]] kernel get_rows_t kernel_get_rows<block_q5_K, QK_NL, dequantize_q5_K>;
template [[host_name("kernel_get_rows_q6_K")]] kernel get_rows_t kernel_get_rows<block_q6_K, QK_NL, dequantize_q6_K>;

-typedef void (mat_mm_t)(device const uchar *, device const float *, device float *, constant int64_t &,\
-                        constant int64_t &, constant int64_t &, constant int64_t &, constant int64_t &, \
-                        constant int64_t &, constant int64_t &, constant uint &, threadgroup uchar *, uint3, uint, uint);
+typedef void (mat_mm_t)(
+        device const uchar * src0,
+        device const uchar * src1,
+        device       float * dst,
+        constant    int64_t & ne00,
+        constant    int64_t & ne02,
+        constant    int64_t & nb01,
+        constant    int64_t & nb02,
+        constant    int64_t & ne12,
+        constant    int64_t & nb10,
+        constant    int64_t & nb11,
+        constant    int64_t & nb12,
+        constant    int64_t & ne0,
+        constant    int64_t & ne1,
+        constant    uint    & gqa,
+        threadgroup uchar *, uint3, uint, uint);

template [[host_name("kernel_mul_mm_f16_f32")]]  kernel mat_mm_t kernel_mul_mm<half4x4,    1, dequantize_f16>;
template [[host_name("kernel_mul_mm_q4_0_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_0, 2, dequantize_q4_0>;
template [[host_name("kernel_mul_mm_q4_1_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_1, 2, dequantize_q4_1>;
template [[host_name("kernel_mul_mm_q8_0_f32")]] kernel mat_mm_t kernel_mul_mm<block_q8_0, 2, dequantize_q8_0>;
template [[host_name("kernel_mul_mm_q2_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q2_K, QK_NL, dequantize_q2_K>;
template [[host_name("kernel_mul_mm_q3_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q3_K, QK_NL, dequantize_q3_K>;
template [[host_name("kernel_mul_mm_q4_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_K, QK_NL, dequantize_q4_K>;
22  ggml.c

@@ -283,7 +283,7 @@ typedef double ggml_float;
// 16-bit float
// on Arm, we use __fp16
// on x86, we use uint16_t
-#ifdef __ARM_NEON
+#if defined(__ARM_NEON) && !defined(_MSC_VER)

// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
//

@@ -4303,10 +4303,21 @@ int64_t ggml_nrows(const struct ggml_tensor * tensor) {
}

size_t ggml_nbytes(const struct ggml_tensor * tensor) {
-    size_t nbytes = tensor->ne[0]*tensor->nb[0]/ggml_blck_size(tensor->type);
-    for (int i = 1; i < GGML_MAX_DIMS; ++i) {
-        nbytes += (tensor->ne[i] - 1)*tensor->nb[i];
+    size_t nbytes;
+    size_t blck_size = ggml_blck_size(tensor->type);
+    if (blck_size == 1) {
+        nbytes = ggml_type_size(tensor->type);
+        for (int i = 0; i < GGML_MAX_DIMS; ++i) {
+            nbytes += (tensor->ne[i] - 1)*tensor->nb[i];
+        }
    }
+    else {
+        nbytes = tensor->ne[0]*tensor->nb[0]/blck_size;
+        for (int i = 1; i < GGML_MAX_DIMS; ++i) {
+            nbytes += (tensor->ne[i] - 1)*tensor->nb[i];
+        }
+    }

    return nbytes;
}
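
Aside (made-up numbers, for orientation only): the new blck_size == 1 branch starts from the element size and walks every dimension from 0, so views whose first dimension is not tightly packed are sized correctly. A quick check of the arithmetic:

    #include <cstdint>
    #include <cstdio>

    int main() {
        // F32 view with ne = {8, 4} and byte strides nb = {4, 64} (rows padded to 64 bytes)
        const int64_t ne[2] = {8, 4};
        const int64_t nb[2] = {4, 64};
        size_t nbytes = sizeof(float);              // ggml_type_size(GGML_TYPE_F32)
        for (int i = 0; i < 2; ++i) {
            nbytes += (ne[i] - 1)*nb[i];            // step to the last element of each dim
        }
        printf("%zu\n", nbytes);                    // 4 + 7*4 + 3*64 = 224
        return 0;
    }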

@@ -18340,7 +18351,8 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
        GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s\n",
                i,
                node->ne[0], node->ne[1],
-                ggml_op_name(node->op));
+                ggml_op_name(node->op),
+                ggml_get_name(node));
    }

    for (int i = 0; i < GGML_OP_COUNT; i++) {

2  ggml.h

@@ -270,7 +270,7 @@ extern "C" {

#if defined(__ARM_NEON) && defined(__CUDACC__)
    typedef half ggml_fp16_t;
-#elif defined(__ARM_NEON)
+#elif defined(__ARM_NEON) && !defined(_MSC_VER)
    typedef __fp16 ggml_fp16_t;
#else
    typedef uint16_t ggml_fp16_t;
gguf-py/gguf/gguf.py

@@ -79,6 +79,7 @@ KEY_TOKENIZER_RWKV = "tokenizer.rwkv.world"
class MODEL_ARCH(IntEnum):
    LLAMA  : int = auto()
    FALCON : int = auto()
+    BAICHUAN:int = auto()
    GPT2   : int = auto()
    GPTJ   : int = auto()
    GPTNEOX: int = auto()

@@ -108,6 +109,7 @@ class MODEL_TENSOR(IntEnum):
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
    MODEL_ARCH.LLAMA:   "llama",
    MODEL_ARCH.FALCON:  "falcon",
+    MODEL_ARCH.BAICHUAN:"baichuan",
    MODEL_ARCH.GPT2:    "gpt2",
    MODEL_ARCH.GPTJ:    "gptj",
    MODEL_ARCH.GPTNEOX: "gptneox",

@@ -153,6 +155,22 @@ MODEL_TENSOR_NAMES: dict[MODEL_ARCH, dict[MODEL_TENSOR, str]] = {
        MODEL_TENSOR.FFN_DOWN:      "blk.{bid}.ffn_down",
        MODEL_TENSOR.FFN_UP:        "blk.{bid}.ffn_up",
    },
+    MODEL_ARCH.BAICHUAN: {
+        MODEL_TENSOR.TOKEN_EMBD:    "token_embd",
+        MODEL_TENSOR.OUTPUT_NORM:   "output_norm",
+        MODEL_TENSOR.OUTPUT:        "output",
+        MODEL_TENSOR.ROPE_FREQS:    "rope_freqs",
+        MODEL_TENSOR.ATTN_NORM:     "blk.{bid}.attn_norm",
+        MODEL_TENSOR.ATTN_Q:        "blk.{bid}.attn_q",
+        MODEL_TENSOR.ATTN_K:        "blk.{bid}.attn_k",
+        MODEL_TENSOR.ATTN_V:        "blk.{bid}.attn_v",
+        MODEL_TENSOR.ATTN_OUT:      "blk.{bid}.attn_output",
+        MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
+        MODEL_TENSOR.FFN_NORM:      "blk.{bid}.ffn_norm",
+        MODEL_TENSOR.FFN_GATE:      "blk.{bid}.ffn_gate",
+        MODEL_TENSOR.FFN_DOWN:      "blk.{bid}.ffn_down",
+        MODEL_TENSOR.FFN_UP:        "blk.{bid}.ffn_up",
+    },
    MODEL_ARCH.GPT2: {
        # TODO
    },

@@ -165,6 +183,10 @@ MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.ROPE_FREQS,
        MODEL_TENSOR.ATTN_ROT_EMBD,
    ],
+    MODEL_ARCH.BAICHUAN: [
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+    ],
}

@@ -187,7 +209,7 @@ class TensorNameMap:
    # Output
    MODEL_TENSOR.OUTPUT: (
        "embed_out", # gptneox
-        "lm_head",   # gpt2 mpt falcon llama-hf
+        "lm_head",   # gpt2 mpt falcon llama-hf baichuan
        "output",    # llama-pth
    ),

@@ -195,7 +217,7 @@ class TensorNameMap:
    MODEL_TENSOR.OUTPUT_NORM: (
        "gpt_neox.final_layer_norm", # gptneox
        "transformer.ln_f",          # gpt2 falcon
-        "model.norm",                # llama-hf
+        "model.norm",                # llama-hf baichuan
        "norm",                      # llama-pth
    ),

@@ -311,6 +333,7 @@ class TensorNameMap:
            tensor_name = tensor_names.get(tensor)
            if tensor_name is None:
                continue
+            mapping[tensor_name] = (tensor, tensor_name)
            for key in keys:
                mapping[key] = (tensor, tensor_name)
        for bid in range(n_blocks):

@@ -319,11 +342,12 @@ class TensorNameMap:
                if tensor_name is None:
                    continue
                tensor_name = tensor_name.format(bid = bid)
+                mapping[tensor_name] = (tensor, tensor_name)
                for key in keys:
                    key = key.format(bid = bid)
                    mapping[key] = (tensor, tensor_name)

-    def get_type_and_name(self, key: str, try_suffixes: Sequence[str]) -> tuple[MODEL_TENSOR, str] | None:
+    def get_type_and_name(self, key: str, try_suffixes: Sequence[str] = ()) -> tuple[MODEL_TENSOR, str] | None:
        result = self.mapping.get(key)
        if result is not None:
            return result

@@ -334,13 +358,13 @@ class TensorNameMap:
            return (result[0], result[1] + suffix)
        return None

-    def get_name(self, key: str, try_suffixes: Sequence[str]) -> str | None:
+    def get_name(self, key: str, try_suffixes: Sequence[str] = ()) -> str | None:
        result = self.get_type_and_name(key, try_suffixes = try_suffixes)
        if result is None:
            return None
        return result[1]

-    def get_type(self, key: str, try_suffixes: Sequence[str]) -> MODEL_TENSOR | None:
+    def get_type(self, key: str, try_suffixes: Sequence[str] = ()) -> MODEL_TENSOR | None:
        result = self.get_type_and_name(key, try_suffixes = try_suffixes)
        if result is None:
            return None
gguf-py/pyproject.toml

@@ -1,6 +1,6 @@
[tool.poetry]
name = "gguf"
-version = "0.3.2"
+version = "0.3.3"
description = "Write ML models in GGUF for GGML"
authors = ["GGML <ggml@ggml.ai>"]
packages = [
k_quants.c

@@ -2609,7 +2609,10 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri

        memcpy(utmp, x[i].scales, 12);

-        const uint32x2_t mins8 = {utmp[1] & kmask1, ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4)};
+        uint32x2_t mins8 = { 0 };
+        mins8 = vset_lane_u32(utmp[1] & kmask1, mins8, 0);
+        mins8 = vset_lane_u32(((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4), mins8, 1);

        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
        utmp[0] &= kmask1;
742
llama.cpp
742
llama.cpp
|
@ -158,6 +158,7 @@ static std::string format(const char * fmt, ...) {
|
||||||
enum llm_arch {
|
enum llm_arch {
|
||||||
LLM_ARCH_LLAMA,
|
LLM_ARCH_LLAMA,
|
||||||
LLM_ARCH_FALCON,
|
LLM_ARCH_FALCON,
|
||||||
|
LLM_ARCH_BAICHUAN,
|
||||||
LLM_ARCH_GPT2,
|
LLM_ARCH_GPT2,
|
||||||
LLM_ARCH_GPTJ,
|
LLM_ARCH_GPTJ,
|
||||||
LLM_ARCH_GPTNEOX,
|
LLM_ARCH_GPTNEOX,
|
||||||
|
@ -172,6 +173,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
|
||||||
{ LLM_ARCH_GPTJ, "gptj" },
|
{ LLM_ARCH_GPTJ, "gptj" },
|
||||||
{ LLM_ARCH_GPTNEOX, "gptneox" },
|
{ LLM_ARCH_GPTNEOX, "gptneox" },
|
||||||
{ LLM_ARCH_MPT, "mpt" },
|
{ LLM_ARCH_MPT, "mpt" },
|
||||||
|
{ LLM_ARCH_BAICHUAN,"baichuan" },
|
||||||
};
|
};
|
||||||
|
|
||||||
enum llm_kv {
|
enum llm_kv {
|
||||||
|
@ -312,6 +314,25 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
|
||||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
LLM_ARCH_BAICHUAN,
|
||||||
|
{
|
||||||
|
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||||
|
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||||
|
{ LLM_TENSOR_OUTPUT, "output" },
|
||||||
|
{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
|
||||||
|
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||||
|
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||||
|
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||||
|
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||||
|
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||||
|
{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
|
||||||
|
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||||
|
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
||||||
|
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||||
|
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||||
|
},
|
||||||
|
},
|
||||||
{
|
{
|
||||||
LLM_ARCH_FALCON,
|
LLM_ARCH_FALCON,
|
||||||
{
|
{
|
||||||
|
@ -1686,6 +1707,15 @@ static void llm_load_hparams(
|
||||||
default: model.type = e_model::MODEL_UNKNOWN;
|
default: model.type = e_model::MODEL_UNKNOWN;
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
|
case LLM_ARCH_BAICHUAN:
|
||||||
|
{
|
||||||
|
GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
|
||||||
|
switch (hparams.n_layer) {
|
||||||
|
case 32: model.type = e_model::MODEL_7B; break;
|
||||||
|
case 40: model.type = e_model::MODEL_13B; break;
|
||||||
|
default: model.type = e_model::MODEL_UNKNOWN;
|
||||||
|
}
|
||||||
|
} break;
|
||||||
default: (void)0;
|
default: (void)0;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -1926,7 +1956,6 @@ static void llm_load_tensors(
|
||||||
const int64_t n_vocab = hparams.n_vocab;
|
const int64_t n_vocab = hparams.n_vocab;
|
||||||
|
|
||||||
const auto tn = LLM_TN(model.arch);
|
const auto tn = LLM_TN(model.arch);
|
||||||
|
|
||||||
switch (model.arch) {
|
switch (model.arch) {
|
||||||
case LLM_ARCH_LLAMA:
|
case LLM_ARCH_LLAMA:
|
||||||
{
|
{
|
||||||
|
@ -1969,6 +1998,72 @@ static void llm_load_tensors(
|
||||||
|
|
||||||
model.layers.resize(n_layer);
|
model.layers.resize(n_layer);
|
||||||
|
|
||||||
|
for (uint32_t i = 0; i < n_layer; ++i) {
|
||||||
|
const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
|
||||||
|
const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
|
||||||
|
|
||||||
|
auto & layer = model.layers[i];
|
||||||
|
|
||||||
|
layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
|
||||||
|
|
||||||
|
layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split);
|
||||||
|
layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split);
|
||||||
|
layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
|
||||||
|
layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
|
||||||
|
|
||||||
|
layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
|
||||||
|
|
||||||
|
layer.w1 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
|
||||||
|
layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
|
||||||
|
layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
|
||||||
|
|
||||||
|
if (backend == GGML_BACKEND_GPU) {
|
||||||
|
vram_weights +=
|
||||||
|
ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
|
||||||
|
ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
|
||||||
|
ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} break;
|
||||||
|
case LLM_ARCH_BAICHUAN:
|
||||||
|
{
|
||||||
|
model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
|
||||||
|
{
|
||||||
|
ggml_backend backend_norm;
|
||||||
|
ggml_backend backend_output;
|
||||||
|
|
||||||
|
if (n_gpu_layers > int(n_layer)) {
|
||||||
|
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying
|
||||||
|
// on Windows however this is detrimental unless everything is on the GPU
|
||||||
|
#ifndef _WIN32
|
||||||
|
backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
|
||||||
|
#else
|
||||||
|
backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
|
||||||
|
#endif // _WIN32
|
||||||
|
|
||||||
|
backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
|
||||||
|
} else {
|
||||||
|
backend_norm = GGML_BACKEND_CPU;
|
||||||
|
backend_output = GGML_BACKEND_CPU;
|
||||||
|
}
|
||||||
|
|
||||||
|
model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
|
||||||
|
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
|
||||||
|
|
||||||
|
if (backend_norm == GGML_BACKEND_GPU) {
|
||||||
|
vram_weights += ggml_nbytes(model.output_norm);
|
||||||
|
}
|
||||||
|
if (backend_output == GGML_BACKEND_GPU_SPLIT) {
|
||||||
|
vram_weights += ggml_nbytes(model.output);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const uint32_t n_ff = hparams.n_ff;
|
||||||
|
|
||||||
|
const int i_gpu_start = n_layer - n_gpu_layers;
|
||||||
|
|
||||||
|
model.layers.resize(n_layer);
|
||||||
|
|
||||||
for (uint32_t i = 0; i < n_layer; ++i) {
|
for (uint32_t i = 0; i < n_layer; ++i) {
|
||||||
const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
|
const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
|
||||||
const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
|
const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
|
||||||
|
@ -2545,6 +2640,367 @@ static struct ggml_cgraph * llm_build_llama(
|
||||||
return gf;
|
return gf;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static struct ggml_cgraph * llm_build_baichaun(
|
||||||
|
llama_context & lctx,
|
||||||
|
const llama_token * tokens,
|
||||||
|
const float * embd,
|
||||||
|
int n_tokens,
|
||||||
|
int n_past) {
|
||||||
|
|
||||||
|
GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
|
||||||
|
|
||||||
|
const int N = n_tokens;
|
||||||
|
|
||||||
|
const auto & model = lctx.model;
|
||||||
|
const auto & hparams = model.hparams;
|
||||||
|
|
||||||
|
const auto & kv_self = lctx.kv_self;
|
||||||
|
|
||||||
|
GGML_ASSERT(!!kv_self.ctx);
|
||||||
|
|
||||||
|
const int64_t n_embd = hparams.n_embd;
|
||||||
|
const int64_t n_layer = hparams.n_layer;
|
||||||
|
const int64_t n_ctx = hparams.n_ctx;
|
||||||
|
const int64_t n_head = hparams.n_head;
|
||||||
|
const int64_t n_head_kv = hparams.n_head_kv;
|
||||||
|
const int64_t n_embd_head = hparams.n_embd_head();
|
||||||
|
const int64_t n_embd_gqa = hparams.n_embd_gqa();
|
||||||
|
|
||||||
|
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||||
|
|
||||||
|
const float freq_base = hparams.rope_freq_base;
|
||||||
|
const float freq_scale = hparams.rope_freq_scale;
|
||||||
|
const float norm_rms_eps = hparams.f_norm_rms_eps;
|
||||||
|
|
||||||
|
const int n_gpu_layers = model.n_gpu_layers;
|
||||||
|
|
||||||
|
auto & buf_compute = lctx.buf_compute;
|
||||||
|
|
||||||
|
struct ggml_init_params params = {
|
||||||
|
/*.mem_size =*/ buf_compute.size,
|
||||||
|
/*.mem_buffer =*/ buf_compute.data,
|
||||||
|
/*.no_alloc =*/ false,
|
||||||
|
};
|
||||||
|
|
||||||
|
params.no_alloc = true;
|
||||||
|
|
||||||
|
struct ggml_context * ctx0 = ggml_init(params);
|
||||||
|
|
||||||
|
ggml_cgraph * gf = ggml_new_graph(ctx0);
|
||||||
|
|
||||||
|
struct ggml_tensor * cur;
|
||||||
|
struct ggml_tensor * inpL;
|
||||||
|
|
||||||
|
if (tokens) {
|
||||||
|
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
|
||||||
|
|
||||||
|
ggml_allocr_alloc(lctx.alloc, inp_tokens);
|
||||||
|
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
||||||
|
memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
|
||||||
|
}
|
||||||
|
ggml_set_name(inp_tokens, "inp_tokens");
|
||||||
|
|
||||||
|
inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
|
||||||
|
} else {
|
||||||
|
#ifdef GGML_USE_MPI
|
||||||
|
GGML_ASSERT(false && "not implemented");
|
||||||
|
#endif
|
||||||
|
|
||||||
|
inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
|
||||||
|
|
||||||
|
ggml_allocr_alloc(lctx.alloc, inpL);
|
||||||
|
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
||||||
|
memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const int i_gpu_start = n_layer - n_gpu_layers;
|
||||||
|
(void) i_gpu_start;
|
||||||
|
|
||||||
|
// offload functions set the tensor output backend to GPU
|
||||||
|
// tensors are GPU-accelerated if any input or the output has been offloaded
|
||||||
|
//
|
||||||
|
// with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
|
||||||
|
// in that case ggml_cuda_assign_buffers has no effect
|
||||||
|
offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
|
||||||
|
offload_func_t offload_func_kq = llama_nop;
|
||||||
|
offload_func_t offload_func_v = llama_nop;
|
||||||
|
|
||||||
|
#ifdef GGML_USE_CUBLAS
|
||||||
|
if (n_gpu_layers > n_layer) {
|
||||||
|
offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
|
||||||
|
}
|
||||||
|
if (n_gpu_layers > n_layer + 1) {
|
||||||
|
offload_func_v = ggml_cuda_assign_buffers_no_alloc;
|
||||||
|
}
|
||||||
|
if (n_gpu_layers > n_layer + 2) {
|
||||||
|
offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
|
||||||
|
}
|
||||||
|
#endif // GGML_USE_CUBLAS
|
||||||
|
|
||||||
|
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
||||||
|
ggml_allocr_alloc(lctx.alloc, KQ_scale);
|
||||||
|
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
||||||
|
ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
|
||||||
|
}
|
||||||
|
ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
|
||||||
|
|
||||||
|
for (int il = 0; il < n_layer; ++il) {
|
||||||
|
ggml_format_name(inpL, "layer_inp_%d", il);
|
||||||
|
|
||||||
|
offload_func_t offload_func = llama_nop;
|
||||||
|
|
||||||
|
#ifdef GGML_USE_CUBLAS
|
||||||
|
if (il >= i_gpu_start) {
|
||||||
|
offload_func = ggml_cuda_assign_buffers_no_alloc;
|
||||||
|
}
|
||||||
|
#endif // GGML_USE_CUBLAS
|
||||||
|
|
||||||
|
struct ggml_tensor * inpSA = inpL;
|
||||||
|
|
||||||
|
// norm
|
||||||
|
{
|
||||||
|
cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps);
|
||||||
|
offload_func(cur);
|
||||||
|
ggml_set_name(cur, "rms_norm_0");
|
||||||
|
|
||||||
|
// cur = cur*attn_norm(broadcasted)
|
||||||
|
cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm);
|
||||||
|
offload_func(cur);
|
||||||
|
ggml_set_name(cur, "attention_norm_0");
|
||||||
|
}
|
||||||
|
|
||||||
|
// self-attention
|
||||||
|
{
|
||||||
|
// compute Q and K and RoPE them
|
||||||
|
struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
||||||
|
offload_func_kq(tmpk);
|
||||||
|
ggml_set_name(tmpk, "tmpk");
|
||||||
|
|
||||||
|
struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
||||||
|
offload_func_kq(tmpq);
|
||||||
|
ggml_set_name(tmpq, "tmpq");
|
||||||
|
|
||||||
|
struct ggml_tensor * Kcur;
|
||||||
|
struct ggml_tensor * Qcur;
|
||||||
|
switch (model.type) {
|
||||||
|
case MODEL_7B:
|
||||||
|
Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
|
||||||
|
Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
|
||||||
|
break;
|
||||||
|
case MODEL_13B:
|
||||||
|
Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N);
|
||||||
|
Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
GGML_ASSERT(false);
|
||||||
|
}
|
||||||
|
|
||||||
|
offload_func_kq(Kcur);
|
||||||
|
ggml_set_name(Kcur, "Kcur");
|
||||||
|
|
||||||
|
offload_func_kq(Qcur);
|
||||||
|
ggml_set_name(Qcur, "Qcur");
|
||||||
|
|
||||||
|
// store key and value to memory
|
||||||
|
{
|
||||||
|
// compute the transposed [N, n_embd] V matrix
|
||||||
|
|
||||||
|
struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
||||||
|
offload_func_v(tmpv);
|
||||||
|
ggml_set_name(tmpv, "tmpv");
|
||||||
|
|
||||||
|
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, N));
|
||||||
|
offload_func_v(Vcur);
|
||||||
|
ggml_set_name(Vcur, "Vcur");
|
||||||
|
|
||||||
|
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
|
||||||
|
offload_func_kq(k);
|
||||||
|
ggml_set_name(k, "k");
|
||||||
|
|
||||||
|
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
|
||||||
|
( n_ctx)*ggml_element_size(kv_self.v),
|
||||||
|
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
|
||||||
|
offload_func_v(v);
|
||||||
|
ggml_set_name(v, "v");
|
||||||
|
|
||||||
|
// important: storing RoPE-ed version of K in the KV cache!
|
||||||
|
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
|
||||||
|
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
|
||||||
|
offload_func_kq(Q);
|
||||||
|
ggml_set_name(Q, "Q");
|
||||||
|
|
||||||
|
struct ggml_tensor * K =
|
||||||
|
ggml_view_3d(ctx0, kv_self.k,
|
||||||
|
n_embd_head, n_past + N, n_head_kv,
|
||||||
|
ggml_element_size(kv_self.k)*n_embd_gqa,
|
||||||
|
ggml_element_size(kv_self.k)*n_embd_head,
|
||||||
|
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
|
||||||
|
offload_func_kq(K);
|
||||||
|
ggml_set_name(K, "K");
|
||||||
|
|
||||||
|
// K * Q
|
||||||
|
+            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            offload_func_kq(KQ);
+            ggml_set_name(KQ, "KQ");
+
+            // KQ_scaled = KQ / sqrt(n_embd_head)
+            // KQ_scaled shape [n_past + N, N, n_head, 1]
+            struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
+            offload_func_kq(KQ_scaled);
+            ggml_set_name(KQ_scaled, "KQ_scaled");
+
+            struct ggml_tensor * KQ_masked;
+            struct ggml_tensor * KQ_scaled_alibi;
+
+            switch (model.type) {
+                case MODEL_7B:
+                    KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
+                    break;
+                case MODEL_13B:
+                    KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, n_past, n_head, 8);
+                    ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
+                    KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past);
+                    break;
+                default:
+                    GGML_ASSERT(false);
+            }
+            // KQ_masked = mask_past(KQ_scaled)
+            // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
+            // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past);
+            // offload_func_kq(KQ_masked);
+            // ggml_set_name(KQ_masked, "KQ_masked");
+
+            // KQ = soft_max(KQ_masked)
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
+            offload_func_v(KQ_soft_max);
+            ggml_set_name(KQ_soft_max, "KQ_soft_max");
+
+            // split cached V into n_head heads
+            struct ggml_tensor * V =
+                ggml_view_3d(ctx0, kv_self.v,
+                        n_past + N, n_embd_head, n_head_kv,
+                        ggml_element_size(kv_self.v)*n_ctx,
+                        ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
+                        ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
+            offload_func_v(V);
+            ggml_set_name(V, "V");
+
+#if 1
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+            offload_func_v(KQV);
+            ggml_set_name(KQV, "KQV");
+#else
+            // make V contiguous in memory to speed up the matmul, however we waste time on the copy
+            // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
+            // is there a better way?
+            struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head));
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
+#endif
+
+            // KQV_merged = KQV.permute(0, 2, 1, 3)
+            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            offload_func_v(KQV_merged);
+            ggml_set_name(KQV_merged, "KQV_merged");
+
+            // cur = KQV_merged.contiguous().view(n_embd, N)
+            cur = ggml_cpy(ctx0,
+                    KQV_merged,
+                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+            offload_func_v(cur);
+            ggml_set_name(cur, "KQV_merged_contiguous");
+
+            // projection (no bias)
+            cur = ggml_mul_mat(ctx0,
+                    model.layers[il].wo,
+                    cur);
+            offload_func(cur);
+            ggml_set_name(cur, "result_wo");
+        }
+
+        struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
+        offload_func(inpFF);
+        ggml_set_name(inpFF, "inpFF");
+
+        // feed-forward network
+        {
+            // norm
+            {
+                cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps);
+                offload_func(cur);
+                ggml_set_name(cur, "rms_norm_1");
+
+                // cur = cur*ffn_norm(broadcasted)
+                cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
+                offload_func(cur);
+                ggml_set_name(cur, "ffn_norm");
+            }
+
+            struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
+                    model.layers[il].w3,
+                    cur);
+            offload_func(tmp);
+            ggml_set_name(tmp, "result_w3");
+
+            cur = ggml_mul_mat(ctx0,
+                    model.layers[il].w1,
+                    cur);
+            offload_func(cur);
+            ggml_set_name(cur, "result_w1");
+
+            // SILU activation
+            cur = ggml_silu(ctx0, cur);
+            offload_func(cur);
+            ggml_set_name(cur, "silu");
+
+            cur = ggml_mul(ctx0, cur, tmp);
+            offload_func(cur);
+            ggml_set_name(cur, "silu_x_result_w3");
+
+            cur = ggml_mul_mat(ctx0,
+                    model.layers[il].w2,
+                    cur);
+            offload_func(cur);
+            ggml_set_name(cur, "result_w2");
+        }
+
+        cur = ggml_add(ctx0, cur, inpFF);
+        offload_func(cur);
+        ggml_set_name(cur, "inpFF_+_result_w2");
+
+        // input for next layer
+        inpL = cur;
+    }
+
+    cur = inpL;
+
+    // norm
+    {
+        cur = ggml_rms_norm(ctx0, cur, norm_rms_eps);
+        offload_func_nr(cur);
+        ggml_set_name(cur, "rms_norm_2");
+
+        // cur = cur*norm(broadcasted)
+        cur = ggml_mul(ctx0, cur, model.output_norm);
+        // offload_func_nr(cur); // TODO CPU + GPU mirrored backend
+        ggml_set_name(cur, "result_norm");
+    }
+
+    // lm_head
+    cur = ggml_mul_mat(ctx0, model.output, cur);
+    ggml_set_name(cur, "result_output");
+
+    ggml_build_forward_expand(gf, cur);
+
+    ggml_free(ctx0);
+
+    return gf;
+}
+
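For reference, the MODEL_13B branch above calls ggml_alibi with max_bias = 8 instead of using a plain causal mask. A minimal standalone sketch (not part of the patch, and assuming a power-of-two head count) of how the per-head ALiBi slopes implied by that call are usually derived:

#include <cmath>
#include <vector>

// slope for head h is m0^(h+1); the bias added to KQ[i][j] is then slope * (j - i)
static std::vector<float> alibi_slopes(int n_head, float max_bias /* = 8.0f, as above */) {
    std::vector<float> slopes(n_head);
    const float m0 = powf(2.0f, -max_bias / n_head);   // assumes n_head is a power of two
    for (int h = 0; h < n_head; ++h) {
        slopes[h] = powf(m0, h + 1);
    }
    return slopes;
}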
static struct ggml_cgraph * llm_build_falcon(
         llama_context & lctx,
     const llama_token * tokens,
@@ -2867,6 +3323,10 @@ static struct ggml_cgraph * llama_build_graph(
            {
                result = llm_build_llama(lctx, tokens, embd, n_tokens, n_past);
            } break;
+       case LLM_ARCH_BAICHUAN:
+           {
+               result = llm_build_baichaun(lctx, tokens, embd, n_tokens, n_past);
+           } break;
        case LLM_ARCH_FALCON:
            {
                result = llm_build_falcon(lctx, tokens, embd, n_tokens, n_past);
@@ -2972,10 +3432,6 @@ static bool llama_eval_internal(
    if (lctx.ctx_metal) {
        ggml_metal_set_n_cb     (lctx.ctx_metal, n_threads);
        ggml_metal_graph_compute(lctx.ctx_metal, gf);
-       ggml_metal_get_tensor   (lctx.ctx_metal, res);
-       if (!lctx.embedding.empty()) {
-           ggml_metal_get_tensor(lctx.ctx_metal, embeddings);
-       }
    } else {
        ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
    }
@@ -3124,10 +3580,9 @@ struct llm_tokenizer_spm {
        while (offs < text.size()) {
            llm_symbol sym;
            size_t len = utf8_len(text[offs]);
-           GGML_ASSERT(offs + len <= text.size());
            sym.text = text.c_str() + offs;
-           sym.n = len;
-           offs += len;
+           sym.n = std::min(len, text.size() - offs);
+           offs += sym.n;
            sym.prev = index - 1;
            sym.next = offs == text.size() ? -1 : index + 1;
            index++;
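For reference, the std::min clamp above guards against a malformed or truncated multi-byte sequence at the end of the input. A standalone sketch of the idea; utf8_len here follows the lookup-table form used by the SPM tokenizer in llama.cpp:

#include <cstddef>
#include <cstdint>

// first-byte high nibble -> expected UTF-8 sequence length
static size_t utf8_len(char src) {
    const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
    uint8_t highbits = static_cast<uint8_t>(src) >> 4;
    return lookup[highbits];
}
// if the last byte of the text starts a 3-byte sequence, utf8_len returns 3 even though only
// one byte remains; clamping sym.n to text.size() - offs keeps the symbol inside the buffer
// instead of tripping the removed GGML_ASSERT.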
@@ -4643,7 +5098,16 @@ void llama_beam_search(llama_context * ctx,
// quantization
//

-static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vector<float> & output, const size_t nelements, const int nthread) {
+template <typename T>
+struct no_init {
+    T value;
+    no_init() { /* do nothing */ }
+};
+
+static void llama_convert_tensor_internal(
+    struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
+    const size_t nelements, const int nthread
+) {
    if (output.size() < nelements) {
        output.resize(nelements);
    }
@@ -4678,7 +5142,6 @@ static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vect
    auto blocks_per_thread = nblocks / nthread;
    auto spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count

-   std::vector<std::thread> workers;
    for (auto tnum = 0, in_buff_offs = 0, out_buff_offs = 0; tnum < nthread; tnum++) {
        auto thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
        auto thr_elems = thr_blocks * block_size; // number of elements for this thread
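For reference, no_init<T> exists purely to skip value-initialization, so that resizing the large staging buffers does not zero-fill memory that is about to be overwritten anyway. A minimal usage sketch, standalone apart from the no_init template added above:

#include <vector>

std::vector<no_init<float>> buf;
buf.resize(8 * 1024 * 1024);            // default-constructs no_init<float>: no memset here
float * data = (float *) buf.data();    // same reinterpretation the patch uses for f32_conv_buf
data[0] = 1.0f;                         // elements are written before they are ever read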
@@ -4691,15 +5154,124 @@ static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vect
                qtype.to_float(inbuf, outbuf, nels);
            }
        };
-       workers.push_back(std::thread(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems));
+       workers.emplace_back(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems);
        in_buff_offs += thr_block_bytes;
        out_buff_offs += thr_elems;
    }
-   for (auto & worker : workers) {
-       worker.join();
-   }
+   for (auto & w : workers) { w.join(); }
+   workers.clear();
}

+#ifdef GGML_USE_K_QUANTS
+static ggml_type get_k_quant_type(
+    ggml_type new_type, const ggml_tensor * tensor, const llama_model & model, llama_ftype ftype, int * i_attention_wv,
+    int n_attention_wv, int * i_feed_forward_w2, int n_feed_forward_w2
+) {
+    const std::string name = ggml_get_name(tensor);
+    // TODO: avoid hardcoded tensor names - use the TN_* constants
+    const auto tn = LLM_TN(model.arch);
+
+    auto use_more_bits = [](int i_layer, int num_layers) -> bool {
+        return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
+    };
+
+    if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
+        int nx = tensor->ne[0];
+        if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
+            new_type = GGML_TYPE_Q8_0;
+        }
+        else if (new_type != GGML_TYPE_Q8_0) {
+            new_type = GGML_TYPE_Q6_K;
+        }
+    } else if (name.find("attn_v.weight") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
+            new_type = *i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+        else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
+                use_more_bits(*i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && *i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
+        else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
+                (*i_attention_wv < n_attention_wv/8 || *i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
+        if (model.type == MODEL_70B) {
+            // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
+            // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
+            // nearly negligible increase in model size by quantizing this tensor with more bits:
+            if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
+        }
+        ++*i_attention_wv;
+    } else if (name.find("ffn_down.weight") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
+            new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
+                     : model.arch != LLM_ARCH_FALCON || use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
+                     : GGML_TYPE_Q3_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
+            new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
+            if (model.arch == LLM_ARCH_FALCON) {
+                new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
+                           use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            } else {
+                if (use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+            }
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && *i_feed_forward_w2 < 4) {
+            new_type = GGML_TYPE_Q5_K;
+        }
+        ++*i_feed_forward_w2;
+    } else if (name.find("attn_output.weight") != std::string::npos) {
+        if (model.arch != LLM_ARCH_FALCON) {
+            if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K  ) new_type = GGML_TYPE_Q3_K;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+        } else {
+            if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
+        }
+    }
+    else if (name.find("attn_qkv.weight") != std::string::npos) {
+        if      (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
+    }
+    else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+    }
+    // This can be used to reduce the size of the Q5_K_S model.
+    // The associated PPL increase is fully in line with the size reduction
+    //else {
+    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
+    //}
+    bool convert_incompatible_tensor = false;
+    if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
+        new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
+        int nx = tensor->ne[0];
+        int ny = tensor->ne[1];
+        if (nx % QK_K != 0) {
+            LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
+            convert_incompatible_tensor = true;
+        }
+    }
+    if (convert_incompatible_tensor) {
+        if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
+            new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
+            LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
+        } else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
+            new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
+            LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
+        } else {
+            throw std::runtime_error("Unsupported tensor size encountered\n");
+        }
+    }
+
+    return new_type;
+}
+#endif
+
static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
    ggml_type quantized_type;
    llama_ftype ftype = params->ftype;
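For reference, the use_more_bits heuristic above upgrades the first eighth of layers, the last eighth, and every third layer in between. A small standalone worked example for a hypothetical 32-layer model:

#include <cstdio>

int main() {
    const int n_layer = 32;
    for (int il = 0; il < n_layer; ++il) {
        const bool more = il < n_layer/8 || il >= 7*n_layer/8 || (il - n_layer/8) % 3 == 2;
        if (more) printf("%d ", il);
    }
    // prints: 0 1 2 3 6 9 12 15 18 21 24 27 28 29 30 31
    printf("\n");
    return 0;
}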
@@ -4783,18 +5355,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
    std::vector<int64_t> hist_all(1 << 4, 0);

    std::vector<std::thread> workers;
+   workers.reserve(nthread);
    std::mutex mutex;

-#ifdef GGML_USE_K_QUANTS
-   auto use_more_bits = [] (int i_layer, int num_layers) -> bool {
-       return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
-   };
-#endif
    int idx = 0;

-   std::vector<uint8_t> read_data;
-   std::vector<uint8_t> work;
+   std::vector<no_init<uint8_t>> read_data;
+   std::vector<no_init<uint8_t>> work;
+   std::vector<no_init<float>> f32_conv_buf;

    // populate the original tensors so we get an initial meta data
    for (int i = 0; i < ml->n_tensors; ++i) {
@@ -4816,7 +5384,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

        const std::string name = ggml_get_name(tensor);

-       read_data.resize(ggml_nbytes(tensor));
+       if (read_data.size() < ggml_nbytes(tensor)) {
+           read_data.resize(ggml_nbytes(tensor));
+       }
        tensor->data = read_data.data();
        ml->load_data_for(tensor);
@@ -4841,101 +5411,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        if (quantize) {
            new_type = quantized_type;
#ifdef GGML_USE_K_QUANTS
-           // TODO: avoid hardcoded tensor names - use the TN_* constants
-           const auto tn = LLM_TN(ml->get_arch());
+           new_type = get_k_quant_type(
+               new_type, tensor, model, ftype, &i_attention_wv, n_attention_wv, &i_feed_forward_w2, n_feed_forward_w2
+           );
-
-           if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
-               int nx = tensor->ne[0];
-               if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
-                   new_type = GGML_TYPE_Q8_0;
-               }
-               else if (new_type != GGML_TYPE_Q8_0) {
-                   new_type = GGML_TYPE_Q6_K;
-               }
-           } else if (name.find("attn_v.weight") != std::string::npos) {
-               if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-               else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-                   new_type = i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
-               }
-               else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
-               else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
-                       use_more_bits(i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
-               else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
-               else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
-                       (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
-               if (model.type == MODEL_70B) {
-                   // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
-                   // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
-                   // nearly negligible increase in model size by quantizing this tensor with more bits:
-                   if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
-               }
-               ++i_attention_wv;
-           } else if (name.find("ffn_down.weight") != std::string::npos) {
-               if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-               else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-                   new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
-                            : model.arch != LLM_ARCH_FALCON || use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
-                            : GGML_TYPE_Q3_K;
-               }
-               else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
-                   new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
-               }
-               else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
-                   if (model.arch == LLM_ARCH_FALCON) {
-                       new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
-                                  use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
-                   } else {
-                       if (use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-                   }
-               }
-               else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-               else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && i_feed_forward_w2 < 4) {
-                   new_type = GGML_TYPE_Q5_K;
-               }
-               ++i_feed_forward_w2;
-           } else if (name.find("attn_output.weight") != std::string::npos) {
-               if (model.arch != LLM_ARCH_FALCON) {
-                   if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K  ) new_type = GGML_TYPE_Q3_K;
-                   else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
-                   else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
-               } else {
-                   if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
-               }
-           }
-           else if (name.find("attn_qkv.weight") != std::string::npos) {
-               if      (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
-               else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
-               else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
-           }
-           else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
-               if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-           }
-           // This can be used to reduce the size of the Q5_K_S model.
-           // The associated PPL increase is fully in line with the size reduction
-           //else {
-           //    if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
-           //}
-           bool convert_incompatible_tensor = false;
-           if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
-               new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
-               int nx = tensor->ne[0];
-               int ny = tensor->ne[1];
-               if (nx % QK_K != 0) {
-                   LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
-                   convert_incompatible_tensor = true;
-               }
-           }
-           if (convert_incompatible_tensor) {
-               if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
-                   new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
-                   LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
-               } else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
-                   new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
-                   LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
-               } else {
-                   throw std::runtime_error("Unsupported tensor size encountered\n");
-               }
-           }
#endif
            // If we've decided to quantize to the same type the tensor is already
            // in then there's nothing to do.
@@ -4950,23 +5428,24 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
            const size_t nelements = ggml_nelements(tensor);

            float * f32_data;
-           std::vector<float> f32_conv_buf;

            if (tensor->type == GGML_TYPE_F32) {
                f32_data = (float *) tensor->data;
            } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
                throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
            } else {
-               llama_convert_tensor_internal(tensor, f32_conv_buf, nelements, nthread);
+               llama_convert_tensor_internal(tensor, f32_conv_buf, workers, nelements, nthread);
                f32_data = (float *) f32_conv_buf.data();
            }

            LLAMA_LOG_INFO("quantizing to %s .. ", ggml_type_name(new_type));
            fflush(stdout);

-           work.resize(nelements * 4); // upper bound on size
+           if (work.size() < nelements * 4) {
+               work.resize(nelements * 4); // upper bound on size
+           }
            new_data = work.data();
-           std::vector<int64_t> hist_cur(1 << 4, 0);
+           std::array<int64_t, 1 << 4> hist_cur = {};

            static const int chunk_size = 32 * 512;
            const int nchunk = (nelements + chunk_size - 1)/chunk_size;
@@ -4977,13 +5456,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                size_t counter = 0;
                new_size = 0;
                auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements]() {
-                   std::vector<int64_t> local_hist;
+                   std::array<int64_t, 1 << 4> local_hist = {};
                    size_t local_size = 0;
                    while (true) {
                        std::unique_lock<std::mutex> lock(mutex);
                        size_t first = counter; counter += chunk_size;
                        if (first >= nelements) {
-                           if (!local_hist.empty()) {
+                           if (local_size > 0) {
                                for (int j=0; j<int(local_hist.size()); ++j) {
                                    hist_cur[j] += local_hist[j];
                                }
@@ -4993,22 +5472,15 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                        }
                        lock.unlock();
                        size_t last = std::min(nelements, first + chunk_size);
-                       if (local_hist.empty()) {
-                           local_hist.resize(hist_cur.size(), 0);
-                       }
                        local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
                    }
                };
-               if ((int) workers.size() < nthread_use - 1) {
-                   workers.resize(nthread_use - 1);
-               }
                for (int it = 0; it < nthread_use - 1; ++it) {
-                   workers[it] = std::thread(compute);
+                   workers.emplace_back(compute);
                }
                compute();
-               for (int it = 0; it < nthread_use - 1; ++it) {
-                   workers[it].join();
-               }
+               for (auto & w : workers) { w.join(); }
+               workers.clear();
            }

            LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
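For reference, the quantization loop above hands out fixed-size chunks from a shared counter under a mutex, so threads stay busy regardless of how per-chunk cost varies. A stripped-down standalone sketch of that pattern, where the hypothetical process() callback stands in for ggml_quantize_chunk:

#include <algorithm>
#include <cstddef>
#include <functional>
#include <mutex>
#include <thread>
#include <vector>

void run_chunked(size_t nelements, size_t chunk_size, int nthread,
                 const std::function<void(size_t, size_t)> & process) {
    std::mutex m;
    size_t counter = 0;
    auto worker = [&]() {
        for (;;) {
            size_t first;
            {
                std::lock_guard<std::mutex> lock(m);
                first = counter;
                counter += chunk_size;
            }
            if (first >= nelements) return;
            process(first, std::min(nelements, first + chunk_size));   // [first, last)
        }
    };
    std::vector<std::thread> workers;
    for (int i = 0; i < nthread - 1; ++i) workers.emplace_back(worker);
    worker();                               // main thread participates, as in the patch
    for (auto & w : workers) w.join();
}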
@@ -6222,7 +6694,7 @@ int llama_tokenize_with_model(
    auto res = llama_tokenize_internal(model->vocab, text, add_bos);

    if (n_max_tokens < (int) res.size()) {
-       LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
+       // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
        return -((int) res.size());
    }

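For reference, with the error log commented out, a negative return value is the only signal that the caller's buffer was too small; its magnitude is the required token count. A sketch of the intended caller pattern, assuming hypothetical model/text variables and the llama_tokenize_with_model(model, text, tokens, n_max_tokens, add_bos) signature from this revision's llama.h:

std::vector<llama_token> tokens(32);
int n = llama_tokenize_with_model(model, text, tokens.data(), (int) tokens.size(), /*add_bos=*/true);
if (n < 0) {
    tokens.resize(-n);                    // -n is the number of tokens actually needed
    n = llama_tokenize_with_model(model, text, tokens.data(), (int) tokens.size(), true);
}
tokens.resize(n);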
4 prompts/chat-with-baichuan.txt Normal file
@@ -0,0 +1,4 @@
以下内容为人类用户与与一位智能助手的对话。

用户:你好!
助手:
@@ -13,7 +13,7 @@ CLI_ARGS_MAIN_PERPLEXITY = [
    "hellaswag-tasks", "ignore-eos", "in-prefix", "in-prefix-bos", "in-suffix", "instruct",
    "interactive", "interactive-first", "keep", "logdir", "logit-bias", "lora", "lora-base",
    "low-vram", "main-gpu", "memory-f32", "mirostat", "mirostat-ent", "mirostat-lr", "mlock",
-   "model", "mtest", "multiline-input", "n-gpu-layers", "n-predict", "no-mmap", "no-mul-mat-q",
+   "model", "multiline-input", "n-gpu-layers", "n-predict", "no-mmap", "no-mul-mat-q",
    "np-penalize-nl", "numa", "ppl-output-type", "ppl-stride", "presence-penalty", "prompt",
    "prompt-cache", "prompt-cache-all", "prompt-cache-ro", "random-prompt", "repeat-last-n",
    "repeat-penalty", "reverse-prompt", "rope-freq-base", "rope-freq-scale", "rope-scale", "seed",
69 scripts/LlamaConfig.cmake.in Normal file
@@ -0,0 +1,69 @@
set(LLAMA_VERSION      @LLAMA_INSTALL_VERSION@)
set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@)
set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@)
set(LLAMA_SHARED_LIB   @BUILD_SHARED_LIBS@)
set(LLAMA_BLAS         @LLAMA_BLAS@)
set(LLAMA_CUBLAS       @LLAMA_CUBLAS@)
set(LLAMA_METAL        @LLAMA_METAL@)
set(LLAMA_MPI          @LLAMA_MPI@)
set(LLAMA_CLBLAST      @LLAMA_CLBLAST@)
set(LLAMA_HIPBLAS      @LLAMA_HIPBLAS@)
set(LLAMA_ACCELERATE   @LLAMA_ACCELERATE@)

@PACKAGE_INIT@

set_and_check(LLAMA_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@")
set_and_check(LLAMA_LIB_DIR "@PACKAGE_LLAMA_LIB_INSTALL_DIR@")
set_and_check(LLAMA_BIN_DIR "@PACKAGE_LLAMA_BIN_INSTALL_DIR@")

# Ensure transient dependencies satisfied

find_package(Threads REQUIRED)
if (APPLE AND LLAMA_ACCELERATE)
    find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED)
endif()

if (LLAMA_BLAS)
    find_package(BLAS REQUIRED)
endif()

if (LLAMA_CUBLAS)
    find_package(CUDAToolkit REQUIRED)
endif()

if (LLAMA_METAL)
    find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
    find_library(METAL_FRAMEWORK Metal REQUIRED)
    find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
endif()

if (LLAMA_MPI)
    find_package(MPI REQUIRED)
endif()

if (LLAMA_CLBLAST)
    find_package(CLBlast REQUIRED)
endif()

if (LLAMA_HIPBLAS)
    find_package(hip REQUIRED)
    find_package(hipblas REQUIRED)
    find_package(rocblas REQUIRED)
endif()

find_library(llama_LIBRARY llama
    REQUIRED
    HINTS ${LLAMA_LIB_DIR})

set(_llama_link_deps "Threads::Threads" "@LLAMA_EXTRA_LIBS@")
add_library(llama UNKNOWN IMPORTED)
set_target_properties(llama
    PROPERTIES
        INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}"
        INTERFACE_LINK_LIBRARIES "${_llama_link_deps}"
        IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
        IMPORTED_LOCATION "${llama_LIBRARY}"
        INTERFACE_COMPILE_FEATURES cxx_std_11
        POSITION_INDEPENDENT_CODE ON )

check_required_components(Llama)
@@ -29,9 +29,8 @@ llama_build_executable(test-tokenizer-0-llama.cpp)
llama_test_executable (test-tokenizer-0-llama test-tokenizer-0-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
llama_build_executable(test-tokenizer-0-falcon.cpp)
#llama_test_executable (test-tokenizer-0-falcon test-tokenizer-0-falcon.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
-llama_build_executable(test-tokenizer-1.cpp)
-# test-tokenizer-1 requires a BPE vocab. re-enable when we have one.
-#llama_test_executable (test-tokenizer-1.llama test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
+llama_build_executable(test-tokenizer-1-llama.cpp)
+llama_test_executable (test-tokenizer-1-llama test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
#llama_test_executable(test-tokenizer-1.aquila test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
llama_build_and_test_executable(test-grammar-parser.cpp)
llama_build_and_test_executable(test-llama-grammar.cpp)

@@ -1,5 +1,6 @@
#include "llama.h"
#include "common.h"
+#include "console.h"

#include <cstdio>
#include <string>

@@ -89,6 +90,12 @@ int main(int argc, char **argv) {
        return 2;
    }

+#ifdef _WIN32
+    // We need this for unicode console support
+    console::init(false, false);
+    atexit([]() { console::cleanup(); });
+#endif
+
    bool success = true;

    for (const auto & test_kv : k_tests()) {
127 tests/test-tokenizer-1-llama.cpp Normal file
@@ -0,0 +1,127 @@
#include "llama.h"
#include "common.h"
#include "console.h"

#include <cassert>
#include <cstdio>
#include <cstring>
#include <string>
#include <codecvt>
#include <map>
#include <vector>
#include <locale>

typedef int codepoint;

std::string codepoint_to_utf8(codepoint cp) {
    std::string result;
    if (0x00 <= cp && cp <= 0x7f) {
        result.push_back(cp);
    } else if (0x80 <= cp && cp <= 0x7ff) {
        result.push_back(0xc0 | ((cp >> 6) & 0x1f));
        result.push_back(0x80 | (cp & 0x3f));
    } else if (0x800 <= cp && cp <= 0xffff) {
        result.push_back(0xe0 | ((cp >> 12) & 0x0f));
        result.push_back(0x80 | ((cp >> 6) & 0x3f));
        result.push_back(0x80 | (cp & 0x3f));
    } else if (0x10000 <= cp && cp <= 0x10ffff) {
        result.push_back(0xf0 | ((cp >> 18) & 0x07));
        result.push_back(0x80 | ((cp >> 12) & 0x3f));
        result.push_back(0x80 | ((cp >> 6) & 0x3f));
        result.push_back(0x80 | (cp & 0x3f));
    } else {
        throw std::invalid_argument("invalid codepoint");
    }
    return result;
}

int main(int argc, char **argv) {
    if (argc < 2) {
        fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
        return 1;
    }

    const std::string fname = argv[1];

    fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());

    llama_model * model;
    llama_context * ctx;

    llama_backend_init(false);

    // load the vocab
    {
        auto lparams = llama_context_default_params();

        lparams.vocab_only = true;

        model = llama_load_model_from_file(fname.c_str(), lparams);

        if (model == NULL) {
            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
            return 1;
        }

        ctx = llama_new_context_with_model(model, lparams);

        if (ctx == NULL) {
            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
            llama_free_model(model);
            return 1;
        }
    }

    GGML_ASSERT(llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM);

#ifdef _WIN32
    // We need this for unicode console support
    console::init(false, false);
    atexit([]() { console::cleanup(); });
#endif

    const int n_vocab = llama_n_vocab(ctx);

    for (int i = 0; i < n_vocab; ++i) {
        std::string str = llama_detokenize_spm(ctx, std::vector<int>(1, i));
        std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
        std::string check = llama_detokenize_spm(ctx, tokens);
        if (check != str) {
            fprintf(stderr, "%s : error: token %d detokenizes to >%s<(%llu) but tokenization of this detokenizes to >%s<(%llu)\n",
                __func__, i, str.c_str(), str.length(), check.c_str(), check.length());
            if (i != 3)
                return 2;
        }
    }

    for (codepoint cp = 0x0000; cp < 0xffff; ++cp) {
        if (cp < 0xd800 || cp > 0xdfff) {
            std::string str = codepoint_to_utf8(cp);
            std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
            std::string check = llama_detokenize_spm(ctx, tokens);
            if (str != check) {
                fprintf(stderr, "%s : error: codepoint %d detokenizes to >%s<(%llu) instead of >%s<(%llu)\n",
                    __func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
                if (cp != 0 && cp != 9601)
                    return 3;
            }
        }
    }
    for (codepoint cp = 0x10000; cp < 0x0010ffff; ++cp) {
        std::string str = codepoint_to_utf8(cp);
        std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
        std::string check = llama_detokenize_spm(ctx, tokens);
        if (str != check) {
            fprintf(stderr, "%s : error: codepoint %d detokenizes to >%s<(%llu) instead of >%s<(%llu)\n",
                __func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
            return 4;
        }
    }

    llama_free_model(model);
    llama_free(ctx);

    llama_backend_free();

    return 0;
}
@@ -1,108 +0,0 @@
-#include "llama.h"
-#include "common.h"
-
-#include <cassert>
-#include <cstdio>
-#include <cstring>
-#include <string>
-#include <codecvt>
-#include <map>
-#include <vector>
-#include <locale>
-
-static std::string escape_whitespace(const std::string& text) {
-    std::string result = "\xe2\x96\x81";
-    for (size_t offs = 0; offs < text.length(); ++offs) {
-        if (text[offs] == ' ') {
-            result += "\xe2\x96\x81";
-        } else {
-            result += text[offs];
-        }
-    }
-    return result;
-}
-
-int main(int argc, char **argv) {
-    if (argc < 2) {
-        fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
-        return 1;
-    }
-
-    const std::string fname = argv[1];
-
-    fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
-
-    llama_model * model;
-    llama_context * ctx;
-
-    llama_backend_init(false);
-
-    // load the vocab
-    {
-        auto lparams = llama_context_default_params();
-
-        lparams.vocab_only = true;
-
-        model = llama_load_model_from_file(fname.c_str(), lparams);
-
-        if (model == NULL) {
-            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
-            return 1;
-        }
-
-        ctx = llama_new_context_with_model(model, lparams);
-
-        if (ctx == NULL) {
-            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
-            llama_free_model(model);
-            return 1;
-        }
-    }
-
-    GGML_ASSERT(llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_BPE);
-
-    const int n_vocab = llama_n_vocab(ctx);
-
-    for (int i = 0; i < n_vocab; ++i) {
-        std::string forward = llama_token_to_piece(ctx, i);
-        std::vector<llama_token> tokens = llama_tokenize(ctx, forward, false);
-        if (tokens.size() == 1) {
-            if (i != tokens[0]) {
-                std::string backward = llama_token_to_piece(ctx, tokens[0]);
-                fprintf(stderr, "%s : error: token %d is string %s but bpe returns token %d %s\n",
-                    __func__, i, llama_token_to_piece(ctx, i).c_str(), tokens[0], backward.c_str());
-                return 2;
-            }
-        }
-    }
-
-#ifdef _WIN32
-    std::wstring_convert<typename std::codecvt_utf8<char16_t>, char16_t> u16converter;
-    for (char16_t ch = 0x0000; ch < 0xffff; ++ch) {
-        std::u16string u16str(1, ch);
-        std::string str = u16converter.to_bytes(u16str);
-        std::vector<llama_token> tokens = llama_tokenize(ctx, escape_whitespace(str).c_str(), false);
-        if (tokens.size() == 1) {
-            fprintf(stderr, "%s : info: %s tokenized to %d \n",
-                __func__, str.c_str(), tokens[0]);
-        }
-    }
-
-    std::wstring_convert<typename std::codecvt_utf8<char32_t>, char32_t> u32converter;
-    for (char32_t ch = 0x0000; ch < 0x0010ffff; ++ch) {
-        std::u32string u32str(1, ch);
-        std::string str = u32converter.to_bytes(u32str);
-        std::vector<llama_token> tokens = llama_tokenize(ctx, escape_whitespace(str).c_str(), false);
-        if (tokens.size() == 1) {
-            fprintf(stderr, "%s : info: %s tokenized to %d \n", __func__, str.c_str(), tokens[0]);
-        }
-    }
-#endif
-
-    llama_free_model(model);
-    llama_free(ctx);
-
-    llama_backend_free();
-
-    return 0;
-}