Merge branch 'master' into context-sensitive-help

commit f6e92a84a2

67 changed files with 6476 additions and 1716 deletions
43  .github/workflows/build.yml  (vendored)

@@ -10,10 +10,10 @@ on:
   push:
     branches:
       - master
-    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu']
+    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']
   pull_request:
     types: [opened, synchronize, reopened]
-    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu']
+    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']

 env:
   BRANCH_NAME: ${{ github.head_ref || github.ref_name }}

@@ -188,7 +188,7 @@ jobs:
         sysctl -a
         mkdir build
         cd build
-        cmake -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF ..
+        cmake ..
         cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)

     - name: Test

@@ -253,6 +253,29 @@ jobs:
           -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0
         cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)

+  macOS-latest-swift:
+    runs-on: macos-latest
+
+    strategy:
+      matrix:
+        destination: ['generic/platform=macOS', 'generic/platform=iOS', 'generic/platform=tvOS']
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v1
+
+      - name: Dependencies
+        id: depends
+        continue-on-error: true
+        run: |
+          brew update
+
+      - name: xcodebuild for swift package
+        id: xcodebuild
+        run: |
+          xcodebuild -scheme llama -destination "${{ matrix.destination }}"
+
   windows-latest-cmake:
     runs-on: windows-latest

@@ -265,17 +288,17 @@ jobs:
       matrix:
         include:
           - build: 'noavx'
-            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON'
+            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON'
           - build: 'avx2'
-            defines: '-DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
           - build: 'avx'
-            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
+            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
           - build: 'avx512'
-            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
           - build: 'clblast'
-            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
+            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
           - build: 'openblas'
-            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
+            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'

     steps:
       - name: Clone

@@ -414,7 +437,7 @@ jobs:
       run: |
        mkdir build
        cd build
-       cmake .. -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON -DBUILD_SHARED_LIBS=ON
+       cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON -DBUILD_SHARED_LIBS=ON
        cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}

     - name: Determine tag name
3  .github/workflows/gguf-publish.yml  (vendored)

@@ -36,8 +36,9 @@ jobs:
         poetry install

     - name: Build package
-      run: poetry build
+      run: cd gguf-py && poetry build
     - name: Publish package
       uses: pypa/gh-action-pypi-publish@release/v1
       with:
         password: ${{ secrets.PYPI_API_TOKEN }}
+        packages-dir: gguf-py/dist
25  .github/workflows/zig-build.yml  (vendored, new file)

@@ -0,0 +1,25 @@
+name: Zig CI
+
+on:
+  pull_request:
+  push:
+    branches:
+      - master
+
+jobs:
+  build:
+    strategy:
+      fail-fast: false
+      matrix:
+        runs-on: [ubuntu-latest, macos-latest, windows-latest]
+    runs-on: ${{ matrix.runs-on }}
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          submodules: recursive
+          fetch-depth: 0
+      - uses: goto-bus-stop/setup-zig@v2
+        with:
+          version: 0.11.0
+      - name: Build Summary
+        run: zig build --summary all -freference-trace
4  .gitignore  (vendored)

@@ -10,6 +10,7 @@
 *.gcno
 *.gcda
 *.dot
+*.metallib
 .DS_Store
 .build/
 .cache/

@@ -91,4 +92,5 @@ tests/test-quantize-perf
 tests/test-sampling
 tests/test-tokenizer-0-llama
 tests/test-tokenizer-0-falcon
-tests/test-tokenizer-1
+tests/test-tokenizer-1-llama
+tests/test-tokenizer-1-bpe
CMakeLists.txt

@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.12) # Don't bump this version for no reason
+cmake_minimum_required(VERSION 3.13) # for add_link_options
 project("llama.cpp" C CXX)

 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

@@ -44,7 +44,7 @@ endif()

 # general
 option(LLAMA_STATIC "llama: static link libraries" OFF)
-option(LLAMA_NATIVE "llama: enable -march=native flag" OFF)
+option(LLAMA_NATIVE "llama: enable -march=native flag" ON)
 option(LLAMA_LTO "llama: enable link time optimization" OFF)

 # debug

@@ -58,15 +58,21 @@ option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer"
 option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)

 # instruction set specific
-option(LLAMA_AVX "llama: enable AVX" ON)
-option(LLAMA_AVX2 "llama: enable AVX2" ON)
+if (LLAMA_NATIVE)
+    set(INS_ENB OFF)
+else()
+    set(INS_ENB ON)
+endif()
+
+option(LLAMA_AVX "llama: enable AVX" ${INS_ENB})
+option(LLAMA_AVX2 "llama: enable AVX2" ${INS_ENB})
 option(LLAMA_AVX512 "llama: enable AVX512" OFF)
 option(LLAMA_AVX512_VBMI "llama: enable AVX512-VBMI" OFF)
 option(LLAMA_AVX512_VNNI "llama: enable AVX512-VNNI" OFF)
-option(LLAMA_FMA "llama: enable FMA" ON)
+option(LLAMA_FMA "llama: enable FMA" ${INS_ENB})
 # in MSVC F16C is implied with AVX2/AVX512
 if (NOT MSVC)
-    option(LLAMA_F16C "llama: enable F16C" ON)
+    option(LLAMA_F16C "llama: enable F16C" ${INS_ENB})
 endif()

 # 3rd party libs

@@ -343,8 +349,9 @@ if (LLAMA_MPI)
         set(GGML_SOURCES_MPI ggml-mpi.c ggml-mpi.h)
         add_compile_definitions(GGML_USE_MPI)
         add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS})
-        set(cxx_flags ${cxx_flags} -Wno-cast-qual)
-        set(c_flags ${c_flags} -Wno-cast-qual)
+        if (NOT MSVC)
+            add_compile_options(-Wno-cast-qual)
+        endif()
         set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${MPI_C_LIBRARIES})
         set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${MPI_C_INCLUDE_DIRS})
         # Even if you're only using the C header, C++ programs may bring in MPI

@@ -418,10 +425,11 @@ if (LLAMA_ALL_WARNINGS)
         set(c_flags -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int
                     -Werror=implicit-function-declaration)
         set(cxx_flags -Wmissing-declarations -Wmissing-noreturn)
+        set(host_cxx_flags "")

         if (CMAKE_C_COMPILER_ID MATCHES "Clang")
             set(warning_flags ${warning_flags} -Wunreachable-code-break -Wunreachable-code-return)
-            set(cxx_flags ${cxx_flags} -Wmissing-prototypes -Wextra-semi)
+            set(host_cxx_flags ${host_cxx_flags} -Wmissing-prototypes -Wextra-semi)

             if (
                 (CMAKE_C_COMPILER_ID STREQUAL "Clang" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 3.8.0) OR

@@ -431,27 +439,38 @@ if (LLAMA_ALL_WARNINGS)
             endif()
         elseif (CMAKE_C_COMPILER_ID STREQUAL "GNU")
             set(c_flags ${c_flags} -Wdouble-promotion)
-            set(cxx_flags ${cxx_flags} -Wno-array-bounds)
+            set(host_cxx_flags ${host_cxx_flags} -Wno-array-bounds)

             if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 7.1.0)
-                set(cxx_flags ${cxx_flags} -Wno-format-truncation)
+                set(host_cxx_flags ${host_cxx_flags} -Wno-format-truncation)
             endif()
             if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 8.1.0)
-                set(cxx_flags ${cxx_flags} -Wextra-semi)
+                set(host_cxx_flags ${host_cxx_flags} -Wextra-semi)
             endif()
         endif()
     else()
         # todo : msvc
     endif()

-    add_compile_options(
-        ${warning_flags}
-        "$<$<COMPILE_LANGUAGE:C>:${c_flags}>"
-        "$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags}>"
-    )
+    set(c_flags   ${c_flags}   ${warning_flags})
+    set(cxx_flags ${cxx_flags} ${warning_flags})
+    add_compile_options("$<$<COMPILE_LANGUAGE:C>:${c_flags}>"
+                        "$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags} ${host_cxx_flags}>")

 endif()

+if (NOT MSVC)
+    set(cuda_flags -Wno-pedantic)
+endif()
+set(cuda_flags ${cxx_flags} -use_fast_math ${cuda_flags})
+
+list(JOIN host_cxx_flags " " cuda_host_flags)  # pass host compiler flags as a single argument
+if (NOT cuda_host_flags STREQUAL "")
+    set(cuda_flags ${cuda_flags} -Xcompiler ${cuda_host_flags})
+endif()
+
+add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${cuda_flags}>")
+
 if (WIN32)
     add_compile_definitions(_CRT_SECURE_NO_WARNINGS)

@@ -491,9 +510,6 @@ if (NOT MSVC)
     if (LLAMA_GPROF)
         add_compile_options(-pg)
     endif()
-    if (LLAMA_NATIVE)
-        add_compile_options(-march=native)
-    endif()
 endif()

 if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") OR ("${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "arm64"))

@@ -548,6 +564,9 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GE
             add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX>)
         endif()
     else()
+        if (LLAMA_NATIVE)
+            add_compile_options(-march=native)
+        endif()
         if (LLAMA_F16C)
             add_compile_options(-mf16c)
         endif()

@@ -644,6 +663,8 @@ add_library(ggml OBJECT
             ggml.h
             ggml-alloc.c
             ggml-alloc.h
+            ggml-backend.c
+            ggml-backend.h
             ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
             ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
             ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
16  Makefile

@@ -2,7 +2,7 @@
 BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml simple batched save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative infill benchmark-matmult parallel finetune export-lora tests/test-c.o

 # Binaries only useful for tests
-TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama
+TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe

 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report

@@ -62,9 +62,11 @@ test: $(TEST_TARGETS)
		if [ "$$test_target" = "tests/test-tokenizer-0-llama" ]; then \
			./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \
		elif [ "$$test_target" = "tests/test-tokenizer-0-falcon" ]; then \
-			continue; \
+			./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
		elif [ "$$test_target" = "tests/test-tokenizer-1-llama" ]; then \
			continue; \
+		elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \
+			continue; \
		else \
			echo "Running test $$test_target..."; \
			./$$test_target; \

@@ -510,9 +512,12 @@ ggml.o: ggml.c ggml.h ggml-cuda.h
 ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
	$(CC) $(CFLAGS) -c $< -o $@

-OBJS += ggml-alloc.o
+ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
+	$(CC) $(CFLAGS) -c $< -o $@
+
+OBJS += ggml-alloc.o ggml-backend.o

-llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h
+llama.o: llama.cpp ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
	$(CXX) $(CXXFLAGS) -c $< -o $@

 common.o: common/common.cpp common/common.h build-info.h common/log.h

@@ -673,6 +678,9 @@ tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp build-info.h gg
 tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp build-info.h ggml.o llama.o common.o $(OBJS)
	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

+tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
 tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp build-info.h ggml.o llama.o common.o $(OBJS)
	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
Package.swift

@@ -10,15 +10,18 @@ let platforms: [SupportedPlatform]? = [
     .tvOS(.v14)
 ]
 let exclude: [String] = []
-let additionalSources: [String] = ["ggml-metal.m", "ggml-metal.metal"]
+let resources: [Resource] = [
+    .process("ggml-metal.metal")
+]
+let additionalSources: [String] = ["ggml-metal.m"]
 let additionalSettings: [CSetting] = [
     .unsafeFlags(["-fno-objc-arc"]),
-    .define("GGML_SWIFT"),
     .define("GGML_USE_METAL")
 ]
 #else
 let platforms: [SupportedPlatform]? = nil
 let exclude: [String] = ["ggml-metal.metal"]
+let resources: [Resource] = []
 let additionalSources: [String] = []
 let additionalSettings: [CSetting] = []
 #endif

@@ -40,13 +43,17 @@ let package = Package(
             "ggml-alloc.c",
             "k_quants.c",
         ] + additionalSources,
+        resources: resources,
         publicHeadersPath: "spm-headers",
         cSettings: [
             .unsafeFlags(["-Wno-shorten-64-to-32"]),
             .define("GGML_USE_K_QUANTS"),
-            .define("GGML_USE_ACCELERATE"),
-            .define("ACCELERATE_NEW_LAPACK"),
-            .define("ACCELERATE_LAPACK_ILP64")
+            .define("GGML_USE_ACCELERATE")
+            // NOTE: NEW_LAPACK will required iOS version 16.4+
+            // We should consider add this in the future when we drop support for iOS 14
+            // (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
+            // .define("ACCELERATE_NEW_LAPACK"),
+            // .define("ACCELERATE_LAPACK_ILP64")
         ] + additionalSettings,
         linkerSettings: [
             .linkedFramework("Accelerate")
29  README.md

@@ -5,7 +5,7 @@
 [](https://github.com/ggerganov/llama.cpp/actions)
 [](https://opensource.org/licenses/MIT)

-[Roadmap](https://github.com/users/ggerganov/projects/7) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
+[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)

 Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++

@@ -95,6 +95,7 @@ as the main playground for developing new features for the [ggml](https://github
 - [X] [Aquila-7B](https://huggingface.co/BAAI/Aquila-7B) / [AquilaChat-7B](https://huggingface.co/BAAI/AquilaChat-7B)
 - [X] [Starcoder models](https://github.com/ggerganov/llama.cpp/pull/3187)
 - [X] [Mistral AI v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)
+- [X] [Refact](https://huggingface.co/smallcloudai/Refact-1_6B-fim)

 **Bindings:**

@@ -377,7 +378,7 @@ Building the program with BLAS support may lead to some performance improvements

 - #### cuBLAS

-  This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
+  This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
   - Using `make`:
     ```bash
     make LLAMA_CUBLAS=1

@@ -613,6 +614,18 @@ For more information, see [https://huggingface.co/docs/transformers/perplexity](
 The perplexity measurements in table above are done against the `wikitext2` test dataset (https://paperswithcode.com/dataset/wikitext-2), with context length of 512.
 The time per token is measured on a MacBook M1 Pro 32GB RAM using 4 and 8 threads.

+#### How to run
+
+1. Download/extract: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
+2. Run `./perplexity -m models/7B/ggml-model-q4_0.gguf -f wiki.test.raw`
+3. Output:
+```
+perplexity : calculating perplexity over 655 chunks
+24.43 seconds per pass - ETA 4.45 hours
+[1]4.5970,[2]5.1807,[3]6.0382,...
+```
+And after 4.45 hours, you will have the final perplexity.
+
 ### Interactive mode

 If you want a more ChatGPT-like experience, you can run in interactive mode by passing `-i` as a parameter.

@@ -775,18 +788,6 @@ If your issue is with model generation quality, then please at least scan the fo
 - [Aligning language models to follow instructions](https://openai.com/research/instruction-following)
 - [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155)

-#### How to run
-
-1. Download/extract: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
-2. Run `./perplexity -m models/7B/ggml-model-q4_0.gguf -f wiki.test.raw`
-3. Output:
-```
-perplexity : calculating perplexity over 655 chunks
-24.43 seconds per pass - ETA 4.45 hours
-[1]4.5970,[2]5.1807,[3]6.0382,...
-```
-And after 4.45 hours, you will have the final perplexity.
-
 ### Android

 #### Building the Project using Android NDK
42  build.zig

@@ -36,14 +36,17 @@ const Maker = struct {
     }

     fn init(builder: *std.build.Builder) !Maker {
-        // const commit_hash = @embedFile(".git/refs/heads/master");
         const target = builder.standardTargetOptions(.{});
+        const zig_version = @import("builtin").zig_version_string;
+        const commit_hash = try std.ChildProcess.exec(
+            .{ .allocator = builder.allocator, .argv = &.{ "git", "rev-parse", "HEAD" } },
+        );
         const config_header = builder.addConfigHeader(
             .{ .style = .blank, .include_path = "build-info.h" },
             .{
                 .BUILD_NUMBER = 0,
-                .BUILD_COMMIT = "12345", // omit newline
-                .BUILD_COMPILER = "Zig 0.11.0",
+                .BUILD_COMMIT = commit_hash.stdout[0 .. commit_hash.stdout.len - 1], // omit newline
+                .BUILD_COMPILER = builder.fmt("Zig {s}", .{zig_version}),
                 .BUILD_TARGET = try target.allocDescription(builder.allocator),
             },
         );

@@ -67,13 +70,21 @@ const Maker = struct {

     fn obj(m: *const Maker, name: []const u8, src: []const u8) *Compile {
         const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize });
+        if (o.target.getAbi() != .msvc)
+            o.defineCMacro("_GNU_SOURCE", null);
+        o.addConfigHeader(m.config_header);
         if (std.mem.endsWith(u8, src, ".c")) {
             o.addCSourceFiles(&.{src}, m.cflags.items);
             o.linkLibC();
         } else {
             o.addCSourceFiles(&.{src}, m.cxxflags.items);
+            if (o.target.getAbi() == .msvc) {
+                o.linkLibC(); // need winsdk + crt
+            } else {
+                // linkLibCpp already add (libc++ + libunwind + libc)
                 o.linkLibCpp();
             }
+        }
         o.addConfigHeader(m.config_header);
         for (m.include_dirs.items) |i| o.addIncludePath(.{ .path = i });
         o.want_lto = m.enable_lto;

@@ -86,8 +97,14 @@ const Maker = struct {
         for (deps) |d| e.addObject(d);
         for (m.objs.items) |o| e.addObject(o);
         for (m.include_dirs.items) |i| e.addIncludePath(.{ .path = i });
-        e.linkLibC();
+
+        // https://github.com/ziglang/zig/issues/15448
+        if (e.target.getAbi() == .msvc) {
+            e.linkLibC(); // need winsdk + crt
+        } else {
+            // linkLibCpp already add (libc++ + libunwind + libc)
             e.linkLibCpp();
+        }
         e.addConfigHeader(m.config_header);
         m.builder.installArtifact(e);
         e.want_lto = m.enable_lto;

@@ -107,18 +124,21 @@ pub fn build(b: *std.build.Builder) !void {

     const ggml = make.obj("ggml", "ggml.c");
     const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
+    const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
     const llama = make.obj("llama", "llama.cpp");
     const common = make.obj("common", "common/common.cpp");
-    const console = make.obj("common", "common/console.cpp");
+    const console = make.obj("console", "common/console.cpp");
     const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
+    const train = make.obj("train", "common/train.cpp");

-    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, llama, common, console, grammar_parser });
-    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, llama, common });
-    _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, llama, common });
-    _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, llama, common });
-    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, llama, common });
+    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, console, grammar_parser });
+    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
+    _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
+    _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
+    _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train });
+    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train });

-    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, llama, common, grammar_parser });
+    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, grammar_parser });
     if (server.target.isWindows()) {
         server.linkSystemLibrary("ws2_32");
     }
common/common.cpp

@@ -167,8 +167,10 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
+            // store the external file name in params
+            params.prompt_file = argv[i];
             std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
-            if (params.prompt.back() == '\n') {
+            if (!params.prompt.empty() && params.prompt.back() == '\n') {
                 params.prompt.pop_back();
             }
         } else if (arg == "-n" || arg == "--n-predict") {

@@ -293,7 +295,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.cfg_negative_prompt));
-            if (params.cfg_negative_prompt.back() == '\n') {
+            if (!params.cfg_negative_prompt.empty() && params.cfg_negative_prompt.back() == '\n') {
                 params.cfg_negative_prompt.pop_back();
             }
         } else if (arg == "--cfg-scale") {

@@ -361,7 +363,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
-            params.lora_adapter.push_back({argv[i], 1.0f});
+            params.lora_adapter.push_back(std::make_tuple(argv[i], 1.0f));
             params.use_mmap = false;
         } else if (arg == "--lora-scaled") {
             if (++i >= argc) {

@@ -373,7 +375,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
-            params.lora_adapter.push_back({lora_adapter, std::stof(argv[i])});
+            params.lora_adapter.push_back(std::make_tuple(lora_adapter, std::stof(argv[i])));
             params.use_mmap = false;
         } else if (arg == "--lora-base") {
             if (++i >= argc) {

@@ -616,6 +618,9 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         process_escapes(params.prompt);
         process_escapes(params.input_prefix);
         process_escapes(params.input_suffix);
+        for (auto & antiprompt : params.antiprompt) {
+            process_escapes(antiprompt);
+        }
     }

     return true;

@@ -937,6 +942,7 @@ std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_to
         result += piece;
     }

+    // NOTE: the original tokenizer decodes bytes after collecting the pieces.
     return result;
 }

@@ -1030,10 +1036,11 @@ llama_token llama_sample_token(
             id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &mirostat_mu);
         } else {
             // Temperature sampling
-            llama_sample_top_k      (ctx, &cur_p, top_k, 1);
-            llama_sample_tail_free  (ctx, &cur_p, tfs_z, 1);
-            llama_sample_typical    (ctx, &cur_p, typical_p, 1);
-            llama_sample_top_p      (ctx, &cur_p, top_p, 1);
+            size_t min_keep = std::max(1, params.n_probs);
+            llama_sample_top_k      (ctx, &cur_p, top_k, min_keep);
+            llama_sample_tail_free  (ctx, &cur_p, tfs_z, min_keep);
+            llama_sample_typical    (ctx, &cur_p, typical_p, min_keep);
+            llama_sample_top_p      (ctx, &cur_p, top_p, min_keep);
             llama_sample_temp(ctx, &cur_p, temp);

             {
common/common.h

@@ -80,7 +80,7 @@ struct gpt_params {
     std::string model_draft       = "";  // draft model for speculative decoding
     std::string model_alias       = "unknown"; // model alias
     std::string prompt            = "";  // user-provided single prompt
-    std::string prompt_file       = "";  // store the external prompt file
+    std::string prompt_file       = "";  // store the external prompt file name
     std::string path_prompt_cache = "";  // path to file for saving/loading prompt eval state
     std::string input_prefix      = "";  // string to prefix user inputs with
     std::string input_suffix      = "";  // string to suffix user inputs with
@@ -11,11 +11,14 @@ import sys
 from pathlib import Path
 from typing import TYPE_CHECKING, Any
 import itertools
-import gguf
 import numpy as np
 import torch
 from sentencepiece import SentencePieceProcessor  # type: ignore[import]

+if 'NO_LOCAL_GGUF' not in os.environ:
+    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
+import gguf
+

 if TYPE_CHECKING:
     from typing import TypeAlias

@@ -174,8 +177,11 @@ if not tokenizer_model_file.is_file():
 print("gguf: get sentencepiece tokenizer vocab, scores and token types")

 tokenizer = SentencePieceProcessor(str(tokenizer_model_file))
+vocab_size = hparams.get('vocab_size')
+if vocab_size is None:
+    vocab_size = tokenizer.vocab_size()

-for i in range(tokenizer.vocab_size()):
+for i in range(vocab_size):
     text: bytes
     score: float
convert-falcon-hf-to-gguf.py

@@ -4,6 +4,7 @@
 from __future__ import annotations

 import argparse
+import contextlib
 import json
 import os
 import struct

@@ -20,32 +21,10 @@ if 'NO_LOCAL_GGUF' not in os.environ:
 import gguf


-def bytes_to_unicode():
-    # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
-    """
-    Returns list of utf-8 byte and a corresponding list of unicode strings.
-    The reversible bpe codes work on unicode strings.
-    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
-    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
-    This is a significant percentage of your normal, say, 32K bpe vocab.
-    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
-    And avoids mapping to whitespace/control characters the bpe code barfs on.
-    """
-    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
-    cs = bs[:]
-    n = 0
-    for b in range(2**8):
-        if b not in bs:
-            bs.append(b)
-            cs.append(2**8+n)
-            n += 1
-    return dict(zip(bs, (chr(n) for n in cs)))
-
-
-def count_model_parts(dir_model: Path) -> int:
+def count_model_parts(dir_model: Path, prefix: str) -> int:
     num_parts = 0
     for filename in os.listdir(dir_model):
-        if filename.startswith("pytorch_model-"):
+        if filename.startswith(prefix):
             num_parts += 1

     if num_parts > 0:

@@ -99,20 +78,26 @@ print("gguf: loading model "+dir_model.name)
 with open(dir_model / "config.json", "r", encoding="utf-8") as f:
     hparams = json.load(f)

-if hparams["architectures"][0] != "RWForCausalLM":
+if hparams["architectures"][0] != "FalconForCausalLM":
     print("Model architecture not supported: " + hparams["architectures"][0])

     sys.exit(1)

 # get number of model parts
-num_parts = count_model_parts(dir_model)
+num_parts = count_model_parts(dir_model, "model-00")
+if num_parts:
+    is_safetensors = True
+    from safetensors import safe_open
+else:
+    is_safetensors = False
+    num_parts = count_model_parts(dir_model, "pytorch_model-")

 ARCH=gguf.MODEL_ARCH.FALCON
 gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])

 print("gguf: get model metadata")

-block_count = hparams["n_layer"]
+block_count = hparams["num_hidden_layers"]

 gguf_writer.add_name("Falcon")
 gguf_writer.add_context_length(2048) # not in config.json

@@ -120,9 +105,9 @@ gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform
 gguf_writer.add_embedding_length(hparams["hidden_size"])
 gguf_writer.add_feed_forward_length(4 * hparams["hidden_size"])
 gguf_writer.add_block_count(block_count)
-gguf_writer.add_head_count(hparams["n_head"])
-if "n_head_kv" in hparams:
-    gguf_writer.add_head_count_kv(hparams["n_head_kv"])
+gguf_writer.add_head_count(hparams["num_attention_heads"])
+if "num_kv_heads" in hparams:
+    gguf_writer.add_head_count_kv(hparams["num_kv_heads"])
 else:
     gguf_writer.add_head_count_kv(1)
 gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])

@@ -133,50 +118,32 @@ gguf_writer.add_file_type(ftype)
 print("gguf: get tokenizer metadata")

 tokens: list[bytearray] = []
+scores: list[float] = []
+toktypes: list[int] = []

-tokenizer_json_file = dir_model / 'tokenizer.json'
-if not tokenizer_json_file.is_file():
-    print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
-    sys.exit(1)
-
 # gpt2 tokenizer
 gguf_writer.add_tokenizer_model("gpt2")

-with open(tokenizer_json_file, "r", encoding="utf-8") as f:
-    tokenizer_json = json.load(f)
-
 print("gguf: get gpt2 tokenizer vocab")

-# The number of tokens in tokenizer.json can differ from the expected vocab size.
-# This causes downstream issues with mismatched tensor sizes when running the inference
-vocab_size = hparams["vocab_size"] if "vocab_size" in hparams else len(tokenizer_json["model"]["vocab"])
-
 # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
 tokenizer = AutoTokenizer.from_pretrained(dir_model)

+# The number of tokens in tokenizer.json can differ from the expected vocab size.
+# This causes downstream issues with mismatched tensor sizes when running the inference
+vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
+assert max(tokenizer.vocab.values()) < vocab_size
+
 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
-byte_encoder = bytes_to_unicode()
-byte_decoder = {v: k for k, v in byte_encoder.items()}

 for i in range(vocab_size):
-    if i in reverse_vocab:
-        try:
-            text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
-        except KeyError:
-            text = bytearray()
-            for c in reverse_vocab[i]:
-                if ord(c) < 256:  # single byte character
-                    text.append(byte_decoder[ord(c)])
-                else:  # multibyte special token character
-                    text.extend(c.encode('utf-8'))
-    else:
-        print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
-        pad_token = f"[PAD{i}]".encode("utf8")
-        text = bytearray(pad_token)
-
-    tokens.append(text)
+    tokens.append(reverse_vocab[i])
+    scores.append(0.0)  # dummy
+    toktypes.append(gguf.TokenType.NORMAL)

 gguf_writer.add_token_list(tokens)
+gguf_writer.add_token_scores(scores)
+gguf_writer.add_token_types(toktypes)

 special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
 special_vocab.add_to_gguf(gguf_writer)

@@ -186,8 +153,8 @@ special_vocab.add_to_gguf(gguf_writer)
 tensor_map = gguf.get_tensor_name_map(ARCH,block_count)

 # params for qkv transform
-n_head = hparams["n_head"]
-n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1
+n_head = hparams["num_attention_heads"]
+n_head_kv = hparams["num_kv_heads"] if "num_kv_heads" in hparams else 1

 head_dim = hparams["hidden_size"] // n_head

@@ -196,6 +163,10 @@ print("gguf: get tensor metadata")

 if num_parts == 0:
     part_names = iter(("pytorch_model.bin",))
+elif is_safetensors:
+    part_names = (
+        f"model-{n:05}-of-{num_parts:05}.safetensors" for n in range(1, num_parts + 1)
+    )
 else:
     part_names = (
         f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)

@@ -205,10 +176,14 @@ for part_name in part_names:
     if args.vocab_only:
         break
     print("gguf: loading model part '" + part_name + "'")
-    model_part = torch.load(dir_model / part_name, map_location="cpu")
+    if is_safetensors:
+        ctx = safe_open(dir_model / part_name, framework="pt", device="cpu")
+    else:
+        ctx = contextlib.nullcontext(torch.load(dir_model / part_name, map_location="cpu"))
+
+    with ctx as model_part:
         for name in model_part.keys():
-            data = model_part[name]
+            data = model_part.get_tensor(name) if is_safetensors else model_part[name]

             old_dtype = data.dtype
@@ -19,29 +19,6 @@ if 'NO_LOCAL_GGUF' not in os.environ:
     sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
 import gguf

-# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
-
-
-def bytes_to_unicode():
-    """
-    Returns list of utf-8 byte and a corresponding list of unicode strings.
-    The reversible bpe codes work on unicode strings.
-    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
-    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
-    This is a significant percentage of your normal, say, 32K bpe vocab.
-    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
-    And avoids mapping to whitespace/control characters the bpe code barfs on.
-    """
-    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
-    cs = bs[:]
-    n = 0
-    for b in range(2**8):
-        if b not in bs:
-            bs.append(b)
-            cs.append(2**8+n)
-            n += 1
-    return dict(zip(bs, (chr(n) for n in cs)))
-
-
 def count_model_parts(dir_model: Path) -> int:
     num_parts = 0

@@ -130,48 +107,32 @@ gguf_writer.add_layer_norm_eps(hparams["layer_norm_eps"])
 print("gguf: get tokenizer metadata")

 tokens: list[bytearray] = []
+scores: list[float] = []
+toktypes: list[int] = []

-tokenizer_json_file = dir_model / 'tokenizer.json'
-if not tokenizer_json_file.is_file():
-    print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
-    sys.exit(1)
-
 # gpt2 tokenizer
 gguf_writer.add_tokenizer_model("gpt2")

-with open(tokenizer_json_file, "r", encoding="utf-8") as f:
-    tokenizer_json = json.load(f)
-
 print("gguf: get gpt2 tokenizer vocab")

-vocab_size = len(tokenizer_json["model"]["vocab"])
-
 # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
 tokenizer = AutoTokenizer.from_pretrained(dir_model)

+# The number of tokens in tokenizer.json can differ from the expected vocab size.
+# This causes downstream issues with mismatched tensor sizes when running the inference
+vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
+assert max(tokenizer.vocab.values()) < vocab_size
+
 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
-byte_encoder = bytes_to_unicode()
-byte_decoder = {v: k for k, v in byte_encoder.items()}

 for i in range(vocab_size):
-    if i in reverse_vocab:
-        try:
-            text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
-        except KeyError:
-            text = bytearray()
-            for c in reverse_vocab[i]:
-                if ord(c) < 256:  # single byte character
-                    text.append(byte_decoder[ord(c)])
-                else:  # multibyte special token character
-                    text.extend(c.encode('utf-8'))
-    else:
-        print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
-        pad_token = f"[PAD{i}]".encode("utf8")
-        text = bytearray(pad_token)
-
-    tokens.append(text)
+    tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
+    scores.append(0.0)  # dummy
+    toktypes.append(gguf.TokenType.NORMAL)

 gguf_writer.add_token_list(tokens)
+gguf_writer.add_token_scores(scores)
+gguf_writer.add_token_types(toktypes)

 special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
 special_vocab.add_to_gguf(gguf_writer)
130
convert-persimmon-to-gguf.py
Normal file
@ -0,0 +1,130 @@
import torch
import os
from pprint import pprint
import sys
import argparse
from pathlib import Path
from sentencepiece import SentencePieceProcessor
if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf

def _flatten_dict(dct, tensors, prefix=None):
    assert isinstance(dct, dict)
    for key in dct.keys():
        new_prefix = prefix + '.' + key if prefix is not None else key
        if isinstance(dct[key], torch.Tensor):
            tensors[new_prefix] = dct[key]
        elif isinstance(dct[key], dict):
            _flatten_dict(dct[key], tensors, new_prefix)
        else:
            raise ValueError(type(dct[key]))
    return None

def _get_sentencepiece_tokenizer_info(dir_model: Path):
    tokenizer_path = dir_model / 'adept_vocab.model'
    print('gguf: getting sentencepiece tokenizer from', tokenizer_path)
    tokenizer = SentencePieceProcessor(str(tokenizer_path))
    print('gguf: adding tokens')
    tokens: list[bytes] = []
    scores: list[float] = []
    toktypes: list[int] = []

    for i in range(tokenizer.vocab_size()):
        text: bytes
        score: float

        piece = tokenizer.id_to_piece(i)
        text = piece.encode("utf-8")
        score = tokenizer.get_score(i)

        toktype = 1
        if tokenizer.is_unknown(i):
            toktype = 2
        if tokenizer.is_control(i):
            toktype = 3
        if tokenizer.is_unused(i):
            toktype = 5
        if tokenizer.is_byte(i):
            toktype = 6

        tokens.append(text)
        scores.append(score)
        toktypes.append(toktype)
        pass
    return tokens, scores, toktypes

def main():
    parser = argparse.ArgumentParser(description="Convert a Persimmon model from Adept (e.g. Persimmon 8b chat) to a GGML compatible file")
    parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
    parser.add_argument("--ckpt-path", type=Path, help="path to persimmon checkpoint .pt file")
    parser.add_argument("--model-dir", type=Path, help="directory containing model e.g. 8b_chat_model_release")
    parser.add_argument("--adept-inference-dir", type=str, help="path to adept-inference code directory")
    args = parser.parse_args()
    sys.path.append(str(args.adept_inference_dir))
    persimmon_model = torch.load(args.ckpt_path)
    hparams = persimmon_model['args']
    pprint(hparams)
    tensors = {}
    _flatten_dict(persimmon_model['model'], tensors, None)

    arch = gguf.MODEL_ARCH.PERSIMMON
    gguf_writer = gguf.GGUFWriter(args.outfile, gguf.MODEL_ARCH_NAMES[arch])

    block_count = hparams.num_layers
    head_count = hparams.num_attention_heads
    head_count_kv = head_count
    ctx_length = hparams.seq_length
    hidden_size = hparams.hidden_size

    gguf_writer.add_name('persimmon-8b-chat')
    gguf_writer.add_context_length(ctx_length)
    gguf_writer.add_embedding_length(hidden_size)
    gguf_writer.add_block_count(block_count)
    gguf_writer.add_feed_forward_length(hparams.ffn_hidden_size)
    gguf_writer.add_rope_dimension_count(hidden_size // head_count)
    gguf_writer.add_head_count(head_count)
    gguf_writer.add_head_count_kv(head_count_kv)
    gguf_writer.add_rope_freq_base(hparams.rotary_emb_base)
    gguf_writer.add_layer_norm_eps(hparams.layernorm_epsilon)

    tokens, scores, toktypes = _get_sentencepiece_tokenizer_info(args.model_dir)
    gguf_writer.add_tokenizer_model('llama')
    gguf_writer.add_token_list(tokens)
    gguf_writer.add_token_scores(scores)
    gguf_writer.add_token_types(toktypes)
    gguf_writer.add_bos_token_id(71013)
    gguf_writer.add_eos_token_id(71013)

    tensor_map = gguf.get_tensor_name_map(arch, block_count)
    print(tensor_map)
    for name in tensors.keys():
        data = tensors[name]
        if name.endswith(".self_attention.rotary_emb.inv_freq"):
            continue
        old_dtype = data.dtype
        # TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?)
        data = data.to(torch.float32).squeeze().numpy()
        new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
        if new_name is None:
            print("Can not map tensor '" + name + "'")
            sys.exit()
        n_dims = len(data.shape)
        print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
        gguf_writer.add_tensor(new_name, data)
    print("gguf: write header")
    gguf_writer.write_header_to_file()
    print("gguf: write metadata")
    gguf_writer.write_kv_data_to_file()
    print("gguf: write tensors")
    gguf_writer.write_tensors_to_file()

    gguf_writer.close()

    print(f"gguf: model successfully exported to '{args.outfile}'")
    print("")


if __name__ == '__main__':
    main()
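The Persimmon converter relies on `_flatten_dict` to turn the nested checkpoint dictionary into flat dotted tensor names before mapping them to GGUF names. A small stand-alone sketch of that traversal with a made-up nested layout (the key names below are illustrative, not the real Adept checkpoint structure):

```python
import torch

def flatten_demo(dct, tensors, prefix=None):
    # same idea as _flatten_dict above: nested dict keys become dotted names
    for key, val in dct.items():
        new_prefix = prefix + '.' + key if prefix is not None else key
        if isinstance(val, torch.Tensor):
            tensors[new_prefix] = val
        elif isinstance(val, dict):
            flatten_demo(val, tensors, new_prefix)

# toy checkpoint layout for illustration only
ckpt = {"language_model": {"embedding": {"word_embeddings": {"weight": torch.zeros(4, 2)}}}}
flat = {}
flatten_demo(ckpt, flat, None)
print(list(flat.keys()))  # ['language_model.embedding.word_embeddings.weight']
```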
318
convert-refact-hf-to-gguf.py
Executable file
@ -0,0 +1,318 @@
#!/usr/bin/env python3
# HF refact--> gguf conversion

from __future__ import annotations

import argparse
import json
import os
import sys
from pathlib import Path

import numpy as np
import torch
from transformers import AutoTokenizer  # type: ignore[import]

if "NO_LOCAL_GGUF" not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / "gguf-py" / "gguf"))
import gguf


def bytes_to_unicode():
    # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
    """
    Returns list of utf-8 byte and a corresponding list of unicode strings.
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a significant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    And avoids mapping to whitespace/control characters the bpe code barfs on.
    """
    bs = (
        list(range(ord("!"), ord("~") + 1))
        + list(range(ord("¡"), ord("¬") + 1))
        + list(range(ord("®"), ord("ÿ") + 1))
    )
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8 + n)
            n += 1
    return dict(zip(bs, (chr(n) for n in cs)))


def count_model_parts(dir_model: Path) -> int:
    num_parts = 0
    for filename in os.listdir(dir_model):
        if filename.startswith("pytorch_model-"):
            num_parts += 1

    if num_parts > 0:
        print("gguf: found " + str(num_parts) + " model parts")
    return num_parts


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Convert a Refact model to a GGML compatible file"
    )
    parser.add_argument(
        "--vocab-only",
        action="store_true",
        help="extract only the vocab",
    )
    parser.add_argument(
        "--outfile",
        type=Path,
        help="path to write to; default: based on input",
    )
    parser.add_argument(
        "model",
        type=Path,
        help="directory containing model file, or model file itself (*.bin)",
    )
    parser.add_argument(
        "ftype",
        type=int,
        choices=[0, 1],
        default=1,
        nargs="?",
        help="output format - use 0 for float32, 1 for float16",
    )
    return parser.parse_args()


args = parse_args()

dir_model = args.model
ftype = args.ftype
if not dir_model.is_dir():
    print(f"Error: {args.model} is not a directory", file=sys.stderr)
    sys.exit(1)

# possible tensor data types
# ftype == 0 -> float32
# ftype == 1 -> float16

# map from ftype to string
ftype_str = ["f32", "f16"]

if args.outfile is not None:
    fname_out = args.outfile
else:
    # output in the same directory as the model by default
    fname_out = dir_model / f"ggml-model-{ftype_str[ftype]}.gguf"

print("gguf: loading model " + dir_model.name)

with open(dir_model / "config.json", "r", encoding="utf-8") as f:
    hparams = json.load(f)

if hparams["architectures"][0] != "GPTRefactForCausalLM":
    print("Model architecture not supported: " + hparams["architectures"][0])

    sys.exit(1)

# get number of model parts
num_parts = count_model_parts(dir_model)

ARCH = gguf.MODEL_ARCH.REFACT
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])

print("gguf: get model metadata")

# Get refact feed forward dimension
hidden_dim = hparams["n_embd"]
inner_dim = 4 * hidden_dim
hidden_dim = int(2 * inner_dim / 3)
multiple_of = 256
ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)

block_count = hparams["n_layer"]

gguf_writer.add_name("Refact")
# refact uses Alibi. So this is from config.json which might be used by training.
gguf_writer.add_context_length(hparams["n_positions"])
gguf_writer.add_embedding_length(hparams["n_embd"])

gguf_writer.add_feed_forward_length(ff_dim)
gguf_writer.add_block_count(block_count)
gguf_writer.add_head_count(hparams["n_head"])
gguf_writer.add_head_count_kv(1)
gguf_writer.add_layer_norm_rms_eps(hparams["layer_norm_epsilon"])
gguf_writer.add_file_type(ftype)

# TOKENIZATION

print("gguf: get tokenizer metadata")

tokens: list[bytearray] = []
scores: list[float] = []
toktypes: list[int] = []

tokenizer_json_file = dir_model / "tokenizer.json"
if not tokenizer_json_file.is_file():
    print(f"Error: Missing {tokenizer_json_file}", file=sys.stderr)
    sys.exit(1)

# gpt2 tokenizer
gguf_writer.add_tokenizer_model("gpt2")

with open(tokenizer_json_file, "r", encoding="utf-8") as f:
    tokenizer_json = json.load(f)

print("gguf: get gpt2 tokenizer vocab")

# The number of tokens in tokenizer.json can differ from the expected vocab size.
# This causes downstream issues with mismatched tensor sizes when running the inference
vocab_size = (
    hparams["vocab_size"]
    if "vocab_size" in hparams
    else len(tokenizer_json["model"]["vocab"])
)

tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)

reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
byte_encoder = bytes_to_unicode()
byte_decoder = {v: k for k, v in byte_encoder.items()}

for i in range(vocab_size):
    if i in reverse_vocab:
        text = reverse_vocab[i]
        try:
            text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
        except KeyError:
            text = bytearray()
            for c in reverse_vocab[i]:
                if ord(c) < 256:  # single byte character
                    text.append(byte_decoder[ord(c)])
                else:  # multibyte special token character
                    text.extend(c.encode("utf-8"))
    else:
        print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
        pad_token = f"[PAD{i}]".encode("utf8")
        text = bytearray(pad_token)

    tokens.append(text)
    scores.append(0.0)  # dymmy
    toktypes.append(gguf.TokenType.NORMAL)  # dummy

gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)

special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
special_vocab.add_to_gguf(gguf_writer)

# TENSORS

tensor_map = gguf.get_tensor_name_map(ARCH, block_count)

# params for qkv transform
n_head = hparams["n_head"]
n_head_kv = 1

head_dim = hparams["n_embd"] // n_head

# tensor info
print("gguf: get tensor metadata")

if num_parts == 0:
    part_names = iter(("pytorch_model.bin",))
else:
    part_names = (
        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
    )
for part_name in part_names:
    if args.vocab_only:
        break
    print("gguf: loading model part '" + part_name + "'")
    model_part = torch.load(dir_model / part_name, map_location="cpu")

    for i in range(block_count):
        if f"transformer.h.{i}.attn.kv.weight" in model_part:
            data = model_part[f"transformer.h.{i}.attn.kv.weight"]
            model_part[f"model.layers.{i}.self_attn.k_proj.weight"] = data[
                : n_head_kv * head_dim
            ]
            model_part[f"model.layers.{i}.self_attn.v_proj.weight"] = data[
                n_head_kv * head_dim :
            ]
            del model_part[f"transformer.h.{i}.attn.kv.weight"]
        if f"transformer.h.{i}.attn.q.weight" in model_part:
            model_part[f"model.layers.{i}.self_attn.q_proj.weight"] = model_part[
                f"transformer.h.{i}.attn.q.weight"
            ]
            del model_part[f"transformer.h.{i}.attn.q.weight"]
        if f"transformer.h.{i}.mlp.gate_up_proj.weight" in model_part:
            data = model_part[f"transformer.h.{i}.mlp.gate_up_proj.weight"]
            model_part[f"model.layers.{i}.mlp.gate_proj.weight"] = data[:ff_dim]
            model_part[f"model.layers.{i}.mlp.up_proj.weight"] = data[ff_dim:]
            del model_part[f"transformer.h.{i}.mlp.gate_up_proj.weight"]

    for name in model_part.keys():
        data = model_part[name]

        old_dtype = data.dtype

        # convert any unsupported data types to float32
        if data.dtype != torch.float16 and data.dtype != torch.float32:
            data = data.to(torch.float32)

        data = data.squeeze().numpy()

        # map tensor names
        new_name = tensor_map.get_name(name, try_suffixes=(".weight",))
        if new_name is None:
            print("Can not map tensor '" + name + "'")
            sys.exit()

        n_dims = len(data.shape)
        data_dtype = data.dtype

        # if f32 desired, convert any float16 to float32
        if ftype == 0 and data_dtype == np.float16:
            data = data.astype(np.float32)

        # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
            data = data.astype(np.float32)

        # if f16 desired, convert any float32 2-dim weight tensors to float16
        if (
            ftype == 1
            and data_dtype == np.float32
            and name.endswith(".weight")
            and n_dims == 2
        ):
            data = data.astype(np.float16)

        print(
            new_name
            + ", n_dims = "
            + str(n_dims)
            + ", "
            + str(old_dtype)
            + " --> "
            + str(data.dtype)
        )

        gguf_writer.add_tensor(new_name, data)


print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
if not args.vocab_only:
    print("gguf: write tensors")
    gguf_writer.write_tensors_to_file()

gguf_writer.close()

print(f"gguf: model successfully exported to '{fname_out}'")
print("")
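The per-layer loop in the Refact converter splits the fused tensors by plain slicing: `attn.kv.weight` is cut at `n_head_kv * head_dim` rows into K and V, and `mlp.gate_up_proj.weight` is cut at `ff_dim` rows into the gate and up projections. A small NumPy sketch of the same arithmetic with made-up sizes (not the real Refact dimensions):

```python
import numpy as np

# made-up sizes for illustration; the real values come from config.json
n_embd, n_head, n_head_kv = 8, 4, 1
head_dim = n_embd // n_head            # 2
ff_dim = 6

kv = np.arange((2 * n_head_kv * head_dim) * n_embd).reshape(2 * n_head_kv * head_dim, n_embd)
k_proj = kv[: n_head_kv * head_dim]    # first head_dim rows -> K
v_proj = kv[n_head_kv * head_dim :]    # remaining rows      -> V
assert k_proj.shape == v_proj.shape == (n_head_kv * head_dim, n_embd)

gate_up = np.zeros((2 * ff_dim, n_embd))
gate_proj, up_proj = gate_up[:ff_dim], gate_up[ff_dim:]
assert gate_proj.shape == up_proj.shape == (ff_dim, n_embd)
```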
@ -20,28 +20,6 @@ if 'NO_LOCAL_GGUF' not in os.environ:
import gguf


def bytes_to_unicode():
    # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
    """
    Returns list of utf-8 byte and a corresponding list of unicode strings.
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a significant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    And avoids mapping to whitespace/control characters the bpe code barfs on.
    """
    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8+n)
            n += 1
    return dict(zip(bs, (chr(n) for n in cs)))


def count_model_parts(dir_model: Path) -> int:
    num_parts = 0
    for filename in os.listdir(dir_model):

@ -117,50 +95,32 @@ gguf_writer.add_file_type(ftype)
print("gguf: get tokenizer metadata")

tokens: list[bytearray] = []
scores: list[float] = []
tokenizer_json_file = dir_model / 'tokenizer.json'
toktypes: list[int] = []
if not tokenizer_json_file.is_file():
print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
sys.exit(1)

# gpt2 tokenizer
gguf_writer.add_tokenizer_model("gpt2")

with open(tokenizer_json_file, "r", encoding="utf-8") as f:
tokenizer_json = json.load(f)

print("gguf: get gpt2 tokenizer vocab")

# The number of tokens in tokenizer.json can differ from the expected vocab size.
# This causes downstream issues with mismatched tensor sizes when running the inference
vocab_size = hparams["vocab_size"] if "vocab_size" in hparams else len(tokenizer_json["model"]["vocab"])

# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
tokenizer = AutoTokenizer.from_pretrained(dir_model)

# The number of tokens in tokenizer.json can differ from the expected vocab size.
# This causes downstream issues with mismatched tensor sizes when running the inference
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
assert max(tokenizer.vocab.values()) < vocab_size

reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
byte_encoder = bytes_to_unicode()
byte_decoder = {v: k for k, v in byte_encoder.items()}

for i in range(vocab_size):
if i in reverse_vocab:
tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
try:
scores.append(0.0) # dummy
text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
toktypes.append(gguf.TokenType.NORMAL)
except KeyError:
text = bytearray()
for c in reverse_vocab[i]:
if ord(c) < 256: # single byte character
text.append(byte_decoder[ord(c)])
else: # multibyte special token character
text.extend(c.encode('utf-8'))
else:
print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
pad_token = f"[PAD{i}]".encode("utf8")
text = bytearray(pad_token)

tokens.append(text)

gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)

special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
special_vocab.add_to_gguf(gguf_writer)
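For reference, the `bytes_to_unicode` helper being dropped here (and kept as a local copy in the Refact converter above) builds an exact bijection between the 256 byte values and printable unicode characters, which is why `byte_decoder` can always map a vocab string back to raw bytes. A quick self-contained check of that round trip, using the same GPT-2 construction:

```python
def bytes_to_unicode():
    # same construction as the GPT-2 encoder: printable bytes map to themselves,
    # the rest are shifted into otherwise-unused unicode code points
    bs = list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8 + n)
            n += 1
    return dict(zip(bs, (chr(n) for n in cs)))

byte_encoder = bytes_to_unicode()
byte_decoder = {v: k for k, v in byte_encoder.items()}

# every one of the 256 byte values survives the round trip
assert all(byte_decoder[byte_encoder[b]] == b for b in range(256))
```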
27
convert.py
@ -42,7 +42,6 @@ if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'):
NDArray: TypeAlias = 'np.ndarray[Any, Any]'

ARCH = gguf.MODEL_ARCH.LLAMA
NAMES=gguf.MODEL_TENSOR_NAMES[ARCH]

DEFAULT_CONCURRENCY = 8
#
@ -339,29 +338,15 @@ class BpeVocab:
    def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        tokenizer = self.bpe_tokenizer
        from transformers.models.gpt2 import tokenization_gpt2  # type: ignore[import]
        byte_encoder = tokenization_gpt2.bytes_to_unicode()
        reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.items()}
        byte_decoder = {v: k for k, v in byte_encoder.items()}
        score = 0.0
        for i, _ in enumerate(tokenizer):
        for i, item in enumerate(tokenizer):
            yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL
            text: bytes = item.encode("utf-8")
            # FIXME: These shouldn't be hardcoded, but it's probably better than the current behavior?
            if i <= 258 and text.startswith(b'<') and text.endswith(b'>'):
                if i == 0 and text == b'<unk>':
                    toktype = gguf.TokenType.UNKNOWN
                elif i == 1 or i == 2:
                    toktype = gguf.TokenType.CONTROL
                elif i >= 3 and text.startswith(b'<0x'):
                    toktype = gguf.TokenType.BYTE
                else:
                    toktype = gguf.TokenType.NORMAL
            else:
                toktype = gguf.TokenType.NORMAL
            yield text, score, toktype

    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        for text in self.added_tokens_list:
            score = -1000.0
            yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
            yield text.encode("utf-8"), score, gguf.TokenType.CONTROL

    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        yield from self.bpe_tokens()
@ -953,7 +938,7 @@ class OutputFile:
    of.close()

def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType:
    wq_type = model[NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0)+".weight"].data_type
    wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0)+".weight"].data_type

    if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32):
        return GGMLFileType.AllF32
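The rewritten `bpe_tokens` no longer decodes tokens through a byte table or guesses token types; it simply yields the tokenizer's own strings in id order with a dummy score and `NORMAL` type. A toy sketch of that id-ordered iteration (the vocab dict and the string "NORMAL" are invented stand-ins for `self.bpe_tokenizer` and `gguf.TokenType.NORMAL`):

```python
# stand-in for self.bpe_tokenizer: token string -> id
toy_bpe_vocab = {"Ġhello": 0, "Ġworld": 1, "!": 2}

def bpe_tokens(vocab):
    reverse_vocab = {tok_id: tok for tok, tok_id in vocab.items()}
    for i, _ in enumerate(vocab):
        # dummy score, "NORMAL" stands in for gguf.TokenType.NORMAL
        yield reverse_vocab[i], 0.0, "NORMAL"

print(list(bpe_tokens(toy_bpe_vocab)))
```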
@ -30,7 +30,6 @@ else()
    add_subdirectory(embd-input)
    add_subdirectory(llama-bench)
    add_subdirectory(beam-search)
    add_subdirectory(cmap-example)
    if (LLAMA_METAL)
        add_subdirectory(metal)
    endif()
@ -9,7 +9,7 @@ if [[ -z "${PROMPT_CACHE_FILE+x}" || -z "${CHAT_SAVE_DIR+x}" ]]; then
    exit 1
fi

MODEL="${MODEL:-./models/13B/ggml-model-q4_0.bin}"
MODEL="${MODEL:-./models/llama-13b/ggml-model-q4_0.gguf}"
PROMPT_TEMPLATE="${PROMPT_TEMPLATE:-./prompts/chat.txt}"
USER_NAME="${USER_NAME:-User}"
AI_NAME="${AI_NAME:-ChatLLaMa}"
@ -61,9 +61,9 @@ fi

if [[ ! -e "$PROMPT_CACHE_FILE" ]]; then
    echo 'Prompt cache does not exist, building...'
    # Default batch_size to 8 here for better user feedback during initial prompt processing
    # Default batch_size to 64 here for better user feedback during initial prompt processing
    ./main 2>>"$LOG" \
        --batch_size 8 \
        --batch_size 64 \
        "${OPTS[@]}" \
        --prompt-cache "$PROMPT_CACHE_FILE" \
        --file "$CUR_PROMPT_FILE" \
@ -61,7 +61,7 @@ For example to apply 40% of the 'shakespeare' LORA adapter, 80% of the 'bible' L
--lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin
```

The scale numbers don't need to add up to one, and you can also use numbers creater than 1 to further increase the influence of an adapter. But making the values to big will sometimes result in worse output. Play around to find good values.
The scale numbers don't need to add up to one, and you can also use numbers greater than 1 to further increase the influence of an adapter. But making the values to big will sometimes result in worse output. Play around to find good values.

Gradient checkpointing reduces the memory requirements by ~50% but increases the runtime.
If you have enough RAM, you can make finetuning a bit faster by disabling checkpointing with `--no-checkpointing`.
@ -313,7 +313,7 @@ class ModelParams:
    gguf_writer.add_feed_forward_length(self.get_n_ff())

def tensor_name(key, bid=None, suffix=".weight"):
    return gguf.MODEL_TENSOR_NAMES[gguf.MODEL_ARCH.LLAMA][key].format(bid=bid) + suffix
    return gguf.TENSOR_NAMES[key].format(bid=bid) + suffix

class Layer:
    def __init__(self, params, lora_params, bid):
@ -332,8 +332,8 @@ static void init_model(struct llama_model * input, struct my_llama_model * model

    assert_shape_1d(layer.attention_norm, hparams.n_embd);
    assert_shape_2d(layer.wq, hparams.n_embd, hparams.n_embd);
    assert_shape_2d(layer.wk, hparams.n_embd, hparams.n_embd);
    assert_shape_2d(layer.wk, hparams.n_embd, hparams.n_embd_gqa());
    assert_shape_2d(layer.wv, hparams.n_embd, hparams.n_embd);
    assert_shape_2d(layer.wv, hparams.n_embd, hparams.n_embd_gqa());
    assert_shape_2d(layer.wo, hparams.n_embd, hparams.n_embd);
    assert_shape_1d(layer.ffn_norm, hparams.n_embd);
    assert_shape_2d(layer.w1, hparams.n_embd, hparams.n_ff);
@ -7,7 +7,7 @@
cd `dirname $0`
cd ..

./build/bin/main --color --instruct --threads 4 \
./main --color --instruct --threads 4 \
    --model ./models/gpt4all-7B/gpt4all-lora-quantized.bin \
    --file ./prompts/alpaca.txt \
    --batch_size 8 --ctx_size 2048 -n -1 \
@ -543,6 +543,9 @@ int main(int argc, char ** argv) {
            if (i > 0) {
                embd.erase(embd.begin(), embd.begin() + i);
            }

            // remove any "future" tokens that we might have inherited from the session from the KV cache
            llama_kv_cache_tokens_rm(ctx, n_past, -1);
        }

        // evaluate tokens in batches
@ -667,7 +670,7 @@ int main(int argc, char ** argv) {
            }
            fflush(stdout);
        }
        // reset color to default if we there is no pending user input
        // reset color to default if there is no pending user input
        if (input_echo && (int) embd_inp.size() == n_consumed) {
            console::set_display(console::reset);
        }
@ -694,10 +697,8 @@ int main(int argc, char ** argv) {
                if (last_output.find(antiprompt, search_start_pos) != std::string::npos) {
                    if (params.interactive) {
                        is_interacting = true;
                        console::set_display(console::user_input);
                    }
                    is_antiprompt = true;
                    fflush(stdout);
                    break;
                }
            }
@ -721,8 +722,6 @@ int main(int argc, char ** argv) {

                is_interacting = true;
                printf("\n");
                console::set_display(console::user_input);
                fflush(stdout);
            } else if (params.instruct) {
                is_interacting = true;
            }
@ -747,6 +746,9 @@ int main(int argc, char ** argv) {
                printf("%s", buffer.c_str());
            }

            // color user input only
            console::set_display(console::user_input);

            std::string line;
            bool another_line = true;
            do {
@ -1,9 +1,3 @@
# llama.cpp/example/parallel

Simplified simulation for serving incoming requests in parallel
Simplified simluation for serving incoming requests in parallel

Running this using the 100 questions in examples/jeopardy/questions.txt
on an M2 MAX (38 core) with 32GB unified memory on MacOS Sonoma 14.0
takes about 235 seconds with sequential responses (-ns 1) and 45 seconds
with 64 parallel responses (-ns 64) in both cases generating 100 answers (-np 100)
using a context of 8192 (-c 8192).
@ -10,6 +10,7 @@
#include <cstdio>
#include <string>
#include <vector>
#include <ctime>

// trim whitespace from the beginning and end of a string
static std::string trim(const std::string & str) {
@ -70,6 +71,26 @@ struct client {
    std::vector<llama_token> tokens_prev;
};

static void print_date_time() {
    std::time_t current_time = std::time(nullptr);
    std::tm* local_time = std::localtime(&current_time);
    char buffer[80];
    strftime(buffer, sizeof(buffer), "%Y-%m-%d %H:%M:%S", local_time);

    printf("\n\033[35mrun parameters as at %s\033[0m\n", buffer);
}

// Define a split string function to ...
static std::vector<std::string> split_string(const std::string& input, char delimiter) {
    std::vector<std::string> tokens;
    std::istringstream stream(input);
    std::string token;
    while (std::getline(stream, token, delimiter)) {
        tokens.push_back(token);
    }
    return tokens;
}

int main(int argc, char ** argv) {
    srand(1234);

@ -104,6 +125,23 @@ int main(int argc, char ** argv) {
    params.logits_all = true;
    std::tie(model, ctx) = llama_init_from_gpt_params(params);

    // load the prompts from an external file if there are any
    if (params.prompt.empty()) {
        printf("\n\033[32mNo new questions so proceed with build-in defaults.\033[0m\n");
    } else {
        // Output each line of the input params.prompts vector and copy to k_prompts
        int index = 0;
        printf("\n\033[32mNow printing the external prompt file %s\033[0m\n\n", params.prompt_file.c_str());

        std::vector<std::string> prompts = split_string(params.prompt, '\n');
        for (const auto& prompt : prompts) {
            k_prompts.resize(index + 1);
            k_prompts[index] = prompt;
            index++;
            printf("%3d prompt: %s\n", index, prompt.c_str());
        }
    }

    fprintf(stderr, "\n\n");
    fflush(stderr);

@ -233,7 +271,7 @@ int main(int argc, char ** argv) {
            client.n_decoded = 0;
            client.i_batch = batch.n_tokens - 1;

            LOG_TEE("\033[1mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id);
            LOG_TEE("\033[31mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id);

            g_seq_id += 1;

@ -332,12 +370,12 @@ int main(int argc, char ** argv) {
                }

                // delete only the generated part of the sequence, i.e. keep the system prompt in the cache
                llama_kv_cache_seq_rm(ctx, client.id, n_tokens_system, n_ctx);
                llama_kv_cache_seq_rm(ctx, client.id, n_tokens_system, -1);

                const auto t_main_end = ggml_time_us();

                LOG_TEE("\033[1mClient %3d, seq %4d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \n\nInput: %s\nResponse: %s\n\n",
                LOG_TEE("\033[31mClient %3d, seq %3d/%3d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \nInput: %s\n\033[35mResponse: %s\033[0m\n\n",
                    client.id, client.seq_id, client.n_prompt, client.n_decoded,
                    client.id, client.seq_id, n_seq, client.n_prompt, client.n_decoded,
                    (t_main_end - client.t_start_prompt) / 1e6,
                    (double) (client.n_prompt + client.n_decoded) / (t_main_end - client.t_start_prompt) * 1e6,
                    n_cache_miss,

@ -357,13 +395,21 @@ int main(int argc, char ** argv) {

    const auto t_main_end = ggml_time_us();

    LOG_TEE("\n\n");
    print_date_time();

    LOG_TEE("\n%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
    if (params.prompt_file.empty()) {
        params.prompt_file = "used built-in defaults";
    }
    LOG_TEE("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str());
    LOG_TEE("Model and path used: \033[32m%s\033[0m\n\n", params.model.c_str());

    LOG_TEE("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt ) / (t_main_end - t_main_start) * 1e6);
    LOG_TEE("Total gen tokens: %6d, speed: %5.2f t/s\n", n_total_gen, (double) (n_total_gen ) / (t_main_end - t_main_start) * 1e6);
    LOG_TEE("Total speed (AVG): %6s speed: %5.2f t/s\n", "", (double) (n_total_prompt + n_total_gen) / (t_main_end - t_main_start) * 1e6);
    LOG_TEE("Cache misses: %6d\n", n_cache_miss);

    LOG_TEE("\n\n");
    LOG_TEE("\n");

    llama_print_timings(ctx);
@ -114,9 +114,9 @@ node index.js

    `top_k`: Limit the next token selection to the K most probable tokens (default: 40).

    `top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.9).
    `top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.95).

    `n_predict`: Set the number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. (default: 128, -1 = infinity).
    `n_predict`: Set the number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. (default: -1, -1 = infinity).

    `n_keep`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context.
    By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the initial prompt.
@ -156,6 +156,8 @@ node index.js

    `logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced (default: []).

    `n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token (default: 0)

- **POST** `/tokenize`: Tokenize a given text.

    *Options:*
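As a usage sketch, the sampling options documented above are sent as JSON fields of a completion request. The snippet below assumes a server listening on localhost:8080 and the completion endpoint described elsewhere in this README; adjust both to your setup, and note that the response field name is also an assumption here.

```python
import json
import urllib.request

# field names follow the parameter list above; endpoint and port are assumed
payload = {
    "prompt": "Building a website can be done in 10 simple steps:",
    "n_predict": 64,   # -1 would mean: keep generating until the model stops
    "top_k": 40,
    "top_p": 0.95,
    "n_probs": 3,      # also return top-3 token probabilities per generated token
}
req = urllib.request.Request(
    "http://localhost:8080/completion",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    result = json.load(resp)
    print(result.get("content"))  # "content" assumed to hold the generated text
```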
@ -27,10 +27,10 @@ def is_present(json, key):
        buf = json[key]
    except KeyError:
        return False
    if json[key] == None:
        return False
    return True



#convert chat to prompt
def convert_chat(messages):
    prompt = "" + args.chat_prompt.replace("\\n", "\n")
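The extra `None` check added to `is_present` means a key that exists but is explicitly `null` in the request JSON is now treated the same as a missing key. A tiny stand-alone illustration of the new behavior:

```python
def is_present(json_obj, key):
    try:
        buf = json_obj[key]
    except KeyError:
        return False
    if json_obj[key] == None:
        return False
    return True

print(is_present({"temperature": 0.8}, "temperature"))   # True
print(is_present({"temperature": None}, "temperature"))  # False (new behavior)
print(is_present({}, "temperature"))                     # False
```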
@ -448,7 +448,7 @@ struct llama_server_context
        n_past = common_part(embd, prompt_tokens);

        // since #3228 we now have to manually manage the KV cache
        llama_kv_cache_seq_rm(ctx, 0, n_past, params.n_ctx);
        llama_kv_cache_seq_rm(ctx, 0, n_past, -1);

        embd = prompt_tokens;
        if (n_past == num_prompt_tokens)
@ -504,9 +504,11 @@ struct llama_server_context
            });
        }

        bool tg = true;
        while (n_past < embd.size())
        {
            int n_eval = (int)embd.size() - n_past;
            tg = n_eval == 1;
            if (n_eval > params.n_batch)
            {
                n_eval = params.n_batch;
@ -532,99 +534,21 @@ struct llama_server_context
            return result;
        }

        {
            // out of user input, sample next token
            const float temp = params.temp;
            const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(model) : params.top_k;
            const float top_p = params.top_p;
            const float tfs_z = params.tfs_z;
            const float typical_p = params.typical_p;
            const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
            const float repeat_penalty = params.repeat_penalty;
            const float alpha_presence = params.presence_penalty;
            const float alpha_frequency = params.frequency_penalty;
            const int mirostat = params.mirostat;
            const float mirostat_tau = params.mirostat_tau;
            const float mirostat_eta = params.mirostat_eta;
            const bool penalize_nl = params.penalize_nl;
            const int32_t n_probs = params.n_probs;

            {
                auto *logits = llama_get_logits(ctx);
                auto n_vocab = llama_n_vocab(model);

                // Apply params.logit_bias map
                for (const auto &it : params.logit_bias)
                {
                    logits[it.first] += it.second;
                }

                std::vector<llama_token_data> candidates;
                candidates.reserve(n_vocab);
                candidates.reserve(llama_n_vocab(model));
                for (llama_token token_id = 0; token_id < n_vocab; token_id++)
                result.tok = llama_sample_token(ctx, NULL, grammar, params, last_n_tokens, candidates);
                {
                    candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
                }

                llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

                // Apply penalties
                const int32_t n_probs = params.n_probs;
                float nl_logit = logits[llama_token_nl(ctx)];
                if (params.temp <= 0 && n_probs > 0)
                auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
                llama_sample_repetition_penalty(ctx, &candidates_p,
                    last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
                    last_n_repeat, repeat_penalty);
                llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
                    last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
                    last_n_repeat, alpha_frequency, alpha_presence);
                if (!penalize_nl)
                {
                    logits[llama_token_nl(ctx)] = nl_logit;
                }

                if (grammar != nullptr) {
                    llama_sample_grammar(ctx, &candidates_p, grammar);
                }

                if (temp <= 0)
                {
                    // Greedy sampling
                    result.tok = llama_sample_token_greedy(ctx, &candidates_p);
                    if (n_probs > 0)
                    {
                        // For llama_sample_token_greedy we need to sort candidates
                        llama_sample_softmax(ctx, &candidates_p);
                    }
                }
                else
                {
                    if (mirostat == 1)
                    {
                        static float mirostat_mu = 2.0f * mirostat_tau;
                        const int mirostat_m = 100;
                        llama_sample_temp(ctx, &candidates_p, temp);
                        result.tok = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
                    }
                    else if (mirostat == 2)
                    {
                        static float mirostat_mu = 2.0f * mirostat_tau;
                        llama_sample_temp(ctx, &candidates_p, temp);
                        result.tok = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
                    }
                    else
                    {
                        // Temperature sampling
                        size_t min_keep = std::max(1, n_probs);
                        llama_sample_top_k(ctx, &candidates_p, top_k, min_keep);
                        llama_sample_tail_free(ctx, &candidates_p, tfs_z, min_keep);
                        llama_sample_typical(ctx, &candidates_p, typical_p, min_keep);
                        llama_sample_top_p(ctx, &candidates_p, top_p, min_keep);
                        llama_sample_temp(ctx, &candidates_p, temp);
                        result.tok = llama_sample_token(ctx, &candidates_p);
                    }
                }

                if (grammar != nullptr) {
                    llama_grammar_accept_token(ctx, grammar, result.tok);
                }

                for (size_t i = 0; i < std::min(candidates_p.size, (size_t)n_probs); ++i)
                {
@ -633,8 +557,10 @@ struct llama_server_context

            last_n_tokens.erase(last_n_tokens.begin());
            last_n_tokens.push_back(result.tok);
            if (tg) {
                num_tokens_predicted++;
            }
            }

            // add it to the context
            embd.push_back(result.tok);
@ -1011,7 +937,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
                invalid_param = true;
                break;
            }
            params.lora_adapter.push_back({argv[i], 1.0f});
            params.lora_adapter.push_back(std::make_tuple(argv[i], 1.0f));
            params.use_mmap = false;
        }
        else if (arg == "--lora-scaled")
@ -1027,7 +953,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
                invalid_param = true;
                break;
            }
            params.lora_adapter.push_back({lora_adapter, std::stof(argv[i])});
            params.lora_adapter.push_back(std::make_tuple(lora_adapter, std::stof(argv[i])));
            params.use_mmap = false;
        }
        else if (arg == "--lora-base")
@ -1124,8 +1050,6 @@ static json format_timings(llama_server_context &llama)
{
    const auto timings = llama_get_timings(llama.ctx);

    assert(timings.n_eval == ptrdiff_t(llama.num_tokens_predicted));

    return json{
        {"prompt_n", timings.n_p_eval},
        {"prompt_ms", timings.t_p_eval_ms},
@ -172,7 +172,7 @@ int main(int argc, char ** argv) {
            LOG("out of drafted tokens\n");
        }

        llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, n_ctx);
        llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1);
        llama_decode(ctx_dft, llama_batch_get_one(&id, 1, n_past_dft, 0));
        ++n_past_dft;

@ -257,7 +257,7 @@ int main(int argc, char ** argv) {
            }

            // evaluate the drafted token on the draft model
            llama_kv_cache_seq_rm(ctx_dft, 0, n_past_cur, n_ctx);
            llama_kv_cache_seq_rm(ctx_dft, 0, n_past_cur, -1);
            llama_decode(ctx_dft, llama_batch_get_one(&drafted.back(), 1, n_past_cur, 0));
            ++n_past_cur;

@ -267,7 +267,7 @@ int main(int argc, char ** argv) {
        }

        // evaluate the target model on the drafted tokens
        llama_kv_cache_seq_rm(ctx_tgt, 0, n_past_tgt, n_ctx);
        llama_kv_cache_seq_rm(ctx_tgt, 0, n_past_tgt, -1);
        llama_decode(ctx_tgt, llama_batch_get_one(drafted.data(), drafted.size(), n_past_tgt, 0));
        ++n_past_tgt;
@ -364,7 +364,7 @@ class ModelParams:
    gguf_writer.add_feed_forward_length(self.get_n_ff())

def tensor_name(key, bid=None):
    return gguf.MODEL_TENSOR_NAMES[gguf.MODEL_ARCH.LLAMA][key].format(bid=bid) + ".weight"
    return gguf.TENSOR_NAMES[key].format(bid=bid) + ".weight"

class Layer:
    def __init__(self, params, bid):
@ -62,7 +62,7 @@
            mkdir -p $out/include
            cp ${src}/llama.h $out/include/
          '';
          cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" "-DLLAMA_MPI=ON" "-DBUILD_SHARED_LIBS=ON" "-DCMAKE_SKIP_BUILD_RPATH=ON" ];
          cmakeFlags = [ "-DLLAMA_NATIVE=OFF" "-DLLAMA_BUILD_SERVER=ON" "-DBUILD_SHARED_LIBS=ON" "-DCMAKE_SKIP_BUILD_RPATH=ON" ];
        in
        {
          packages.default = pkgs.stdenv.mkDerivation {
169
ggml-alloc.c
169
ggml-alloc.c
|
@ -1,4 +1,5 @@
|
||||||
#include "ggml-alloc.h"
|
#include "ggml-alloc.h"
|
||||||
|
#include "ggml-backend.h"
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
#include <stdarg.h>
|
#include <stdarg.h>
|
||||||
|
@ -6,25 +7,6 @@
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
|
||||||
#ifdef __has_include
|
|
||||||
#if __has_include(<unistd.h>)
|
|
||||||
#include <unistd.h>
|
|
||||||
#if defined(_POSIX_MAPPED_FILES)
|
|
||||||
#include <sys/types.h>
|
|
||||||
#include <sys/mman.h>
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if defined(_WIN32)
|
|
||||||
#define WIN32_LEAN_AND_MEAN
|
|
||||||
#ifndef NOMINMAX
|
|
||||||
#define NOMINMAX
|
|
||||||
#endif
|
|
||||||
#include <windows.h>
|
|
||||||
#include <memoryapi.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
|
||||||
#define UNUSED(x) (void)(x)
|
#define UNUSED(x) (void)(x)
|
||||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||||
|
@ -80,8 +62,9 @@ struct free_block {
|
||||||
#define MAX_FREE_BLOCKS 256
|
#define MAX_FREE_BLOCKS 256
|
||||||
|
|
||||||
struct ggml_allocr {
|
struct ggml_allocr {
|
||||||
|
struct ggml_backend_buffer * buffer;
|
||||||
|
bool buffer_owned;
|
||||||
void * data;
|
void * data;
|
||||||
size_t size;
|
|
||||||
size_t alignment;
|
size_t alignment;
|
||||||
int n_free_blocks;
|
int n_free_blocks;
|
||||||
struct free_block free_blocks[MAX_FREE_BLOCKS];
|
struct free_block free_blocks[MAX_FREE_BLOCKS];
|
||||||
|
@ -119,16 +102,9 @@ static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tens
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
static size_t ggml_allocr_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
|
|
||||||
return ggml_nbytes(tensor);
|
|
||||||
|
|
||||||
UNUSED(alloc);
|
|
||||||
}
|
|
||||||
|
|
||||||
// check if a tensor is allocated by this buffer
|
// check if a tensor is allocated by this buffer
|
||||||
static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_tensor * tensor) {
|
static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_tensor * tensor) {
|
||||||
void * ptr = tensor->data;
|
return tensor->buffer == alloc->buffer;
|
||||||
return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool ggml_is_view(struct ggml_tensor * t) {
|
static bool ggml_is_view(struct ggml_tensor * t) {
|
||||||
|
@ -136,11 +112,10 @@ static bool ggml_is_view(struct ggml_tensor * t) {
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
|
void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
|
||||||
#ifdef GGML_ALLOCATOR_DEBUG
|
|
||||||
GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
|
GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
|
||||||
GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
|
GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
|
||||||
#endif
|
|
||||||
size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
|
size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor);
|
||||||
size = aligned_offset(NULL, size, alloc->alignment);
|
size = aligned_offset(NULL, size, alloc->alignment);
|
||||||
|
|
||||||
AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
|
AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
|
||||||
|
@@ -188,6 +163,8 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
 
     tensor->data = addr;
     AT_PRINTF("%s: allocated data at %p\n", __func__, tensor->data);
+    tensor->buffer = alloc->buffer;
+    ggml_backend_buffer_init_tensor(alloc->buffer, tensor);
 
 #ifdef GGML_ALLOCATOR_DEBUG
     add_allocated_tensor(alloc, tensor);
@@ -208,19 +185,21 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
 
 // this is a very naive implementation, but for our case the number of free blocks should be very small
 static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
-    void * ptr = tensor->data;
-
     if (ggml_allocr_is_own(alloc, tensor) == false) {
         // the tensor was not allocated in this buffer
         // this can happen because the graph allocator will try to free weights and other tensors from different buffers
         // the easiest way to deal with this is just to ignore it
+        AT_PRINTF("ignoring %s (their buffer: %p, our buffer: %p)\n", tensor->name, (void *)tensor->buffer, (void *)alloc->buffer);
         return;
     }
 
-    size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
+    void * ptr = tensor->data;
+
+    size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor);
     size = aligned_offset(NULL, size, alloc->alignment);
     AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks);
-    AT_PRINTF("%s: alloc->data = %p alloc->data+alloc->size = %p alloc->data+alloc->max_size = %p\n", __func__, alloc->data, (char*)alloc->data + alloc->size, (char*)alloc->data + alloc->max_size);
+
+    ggml_backend_buffer_free_tensor(alloc->buffer, tensor);
 
 #ifdef GGML_ALLOCATOR_DEBUG
     remove_allocated_tensor(alloc, tensor);
@@ -285,15 +264,18 @@ void ggml_allocr_reset(struct ggml_allocr * alloc) {
     alloc->n_free_blocks = 1;
     size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
     alloc->free_blocks[0].addr = (char *)alloc->data + align_offset;
-    alloc->free_blocks[0].size = alloc->size - align_offset;
+    alloc->free_blocks[0].size = ggml_backend_buffer_get_size(alloc->buffer) - align_offset;
 }
 
 struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) {
-    struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
+    struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(NULL, data, size);
+
+    struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr));
 
     *alloc = (struct ggml_allocr){
-        /*.data = */ data,
-        /*.size = */ size,
+        /*.buffer = */ buffer,
+        /*.buffer_owned = */ true,
+        /*.base = */ ggml_backend_buffer_get_base(buffer),
         /*.alignment = */ alignment,
         /*.n_free_blocks = */ 0,
         /*.free_blocks = */ {{0}},
@@ -312,74 +294,26 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
     return alloc;
 }
 
-// OS specific functions to allocate and free uncommitted virtual memory
-static void * alloc_vmem(size_t size) {
-#if defined(_WIN32)
-    return VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS);
-#elif defined(_POSIX_MAPPED_FILES)
-    void * ptr = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0);
-    if (ptr == MAP_FAILED) {
-        return NULL;
-    }
-    return ptr;
-#else
-    // use a fixed address for other platforms
-    uintptr_t base_addr = (uintptr_t)-size - 0x100;
-    return (void *)base_addr;
-#endif
-}
-
-static void free_vmem(void * base_addr, size_t size) {
-#if defined(_WIN32)
-    VirtualFree(base_addr, 0, MEM_RELEASE);
-    UNUSED(size);
-#elif defined(_POSIX_MAPPED_FILES)
-    munmap(base_addr, size);
-#else
-    // nothing to do
-    UNUSED(base_addr);
-    UNUSED(size);
-#endif
-}
-
-// allocate uncommitted virtual memory to measure the size of the graph
-static void alloc_measure_vmem(void ** base_addr, size_t * size) {
-    // 128GB for 64-bit, 1GB for 32-bit
-    *size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<37;
-    do {
-        *base_addr = alloc_vmem(*size);
-        if (*base_addr != NULL) {
-            AT_PRINTF("allocated %.2f GB of virtual memory for measure buffer at %p\n", *size / 1024.0 / 1024.0 / 1024.0, *base_addr);
-            return;
-        }
-        // try again with half the size
-        *size /= 2;
-    } while (*size > 0);
-
-    GGML_ASSERT(!"failed to allocate virtual memory for measure buffer");
-}
-
-static void free_measure_vmem(void * base_addr, size_t size) {
-    free_vmem(base_addr, size);
-}
-
 struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
-    struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
+    struct ggml_allocr * alloc = ggml_allocr_new((void *)0x1000, (size_t)-0x1001, alignment);
+    alloc->measure = true;
 
-    void * base_addr;
-    size_t size;
+    return alloc;
+}
 
-    alloc_measure_vmem(&base_addr, &size);
+struct ggml_allocr * ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer) {
+    struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr));
 
     *alloc = (struct ggml_allocr){
-        /*.data = */ base_addr,
-        /*.size = */ size,
-        /*.alignment = */ alignment,
+        /*.buffer = */ buffer,
+        /*.buffer_owned = */ false,
+        /*.base = */ ggml_backend_buffer_get_base(buffer),
+        /*.alignment = */ ggml_backend_buffer_get_alignment(buffer),
         /*.n_free_blocks = */ 0,
         /*.free_blocks = */ {{0}},
         /*.hash_table = */ {{0}},
        /*.max_size = */ 0,
-        /*.measure = */ true,
+        /*.measure = */ false,
        /*.parse_seq = */ {0},
        /*.parse_seq_len = */ 0,
 #ifdef GGML_ALLOCATOR_DEBUG
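
With this change the measure allocator is just a regular allocator built over a fake base address, so the usual two-pass flow still applies. A minimal sketch of that flow follows; the graph gf, the rebuild of the graph between passes, and the 32-byte alignment are assumptions for illustration, not part of this diff.

    // sketch only: measure pass, then allocate for real
    struct ggml_allocr * measure = ggml_allocr_new_measure(32);
    size_t mem_size = ggml_allocr_alloc_graph(measure, gf) + 32; // worst-case graph size
    ggml_allocr_free(measure);

    // in practice the graph is rebuilt before the second pass
    void * buf = malloc(mem_size);
    struct ggml_allocr * alloc = ggml_allocr_new(buf, mem_size, 32);
    ggml_allocr_alloc_graph(alloc, gf);   // assigns tensor->data inside buf
    ggml_allocr_free(alloc);              // frees the wrapper, not buf itself
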
@@ -393,8 +327,8 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
 }
 
 void ggml_allocr_free(struct ggml_allocr * alloc) {
-    if (alloc->measure) {
-        free_measure_vmem(alloc->data, alloc->size);
+    if (alloc->buffer_owned) {
+        ggml_backend_buffer_free(alloc->buffer);
     }
     free(alloc);
 }
@@ -437,7 +371,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
         case GGML_OP_ROPE:
         case GGML_OP_RMS_NORM:
         case GGML_OP_SOFT_MAX:
-        case GGML_OP_CONT:
             return true;
 
         default:
@@ -445,12 +378,23 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
     }
 }
 
+static void init_view(struct ggml_allocr * alloc, struct ggml_tensor * view) {
+    assert(view->view_src != NULL && view->view_src->data != NULL);
+    view->backend = view->view_src->backend;
+    view->buffer = view->view_src->buffer;
+    view->data = (char *)view->view_src->data + view->view_offs;
+
+    // FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend
+    // due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras
+    assert(ggml_allocr_is_measure(alloc) || view->buffer->backend == alloc->buffer->backend);
+    ggml_backend_buffer_init_tensor(alloc->buffer, view);
+}
+
 static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) {
     struct hash_node * ht = alloc->hash_table;
     if (node->data == NULL) {
         if (ggml_is_view(node)) {
-            assert(node->view_src->data != NULL);
-            node->data = (char *)node->view_src->data + node->view_offs;
+            init_view(alloc, node);
         } else {
             // see if we can reuse a parent's buffer (inplace)
             if (ggml_op_can_inplace(node->op)) {
@@ -478,13 +422,17 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
                 // adding a view_src pointer to the tensor would solve this and simplify the code dealing with views
                 // for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data)
                 AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
-                node->data = parent->data;
+                node->view_src = view_src;
+                view_src_hn->n_views += 1;
+                init_view(alloc, node);
                 return;
             }
         }
         else {
             AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
-            node->data = parent->data;
+            node->view_src = parent;
+            p_hn->n_views += 1;
+            init_view(alloc, node);
             return;
         }
     }
@@ -495,7 +443,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
         }
     }
 
-static size_t ggml_allocr_alloc_graph_tensors_n(
+size_t ggml_allocr_alloc_graph_n(
         struct ggml_allocr * alloc,
         struct ggml_cgraph ** graphs, int n_graphs,
         struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
@@ -513,6 +461,10 @@ static size_t ggml_allocr_alloc_graph_tensors_n(
         if (ggml_is_view(node)) {
             struct ggml_tensor * view_src = node->view_src;
             hash_get(ht, view_src)->n_views += 1;
+            if (node->buffer == NULL && node->data != NULL) {
+                // view of a pre-allocated tensor, didn't call init_view() yet
+                init_view(alloc, node);
+            }
         }
 
         for (int j = 0; j < GGML_MAX_SRC; j++) {
@@ -521,6 +473,9 @@ static size_t ggml_allocr_alloc_graph_tensors_n(
                 break;
             }
             hash_get(ht, parent)->n_children += 1;
+            if (ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) {
+                init_view(alloc, parent);
+            }
         }
     }
 }
@@ -631,7 +586,7 @@ static size_t ggml_allocr_alloc_graph_tensors_n(
 }
 
 size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
-    return ggml_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
+    return ggml_allocr_alloc_graph_n(alloc, &graph, 1, NULL, NULL);
 }
 
 size_t ggml_allocr_max_size(struct ggml_allocr * alloc) {
ggml-alloc.h
@@ -6,9 +6,11 @@
 extern "C" {
 #endif
 
+struct ggml_backend_buffer;
+
 GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);
 GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
+GGML_API struct ggml_allocr * ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer);
 
 // tell the allocator to parse nodes following the order described in the list
 // you should call this if your graph are optimized to execute out-of-order
@@ -21,6 +23,10 @@ GGML_API void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor
 GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph);
 GGML_API size_t ggml_allocr_max_size (struct ggml_allocr * alloc);
 
+GGML_API size_t ggml_allocr_alloc_graph_n(
+    struct ggml_allocr * alloc,
+    struct ggml_cgraph ** graphs, int n_graphs,
+    struct ggml_tensor *** inputs, struct ggml_tensor *** outputs);
+
 #ifdef __cplusplus
 }
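
The new ggml_allocr_new_from_buffer entry point lets a graph allocator sit directly on a backend buffer. A rough usage sketch, assuming a backend handle and a compute_size measured beforehand (both placeholders):

    // sketch only: graph allocator over a backend buffer
    ggml_backend_buffer_t buf = ggml_backend_alloc_buffer(backend, compute_size);
    struct ggml_allocr * alloc = ggml_allocr_new_from_buffer(buf);

    ggml_allocr_reset(alloc);            // at the start of each evaluation
    ggml_allocr_alloc_graph(alloc, gf);  // places the graph's tensors inside buf

    ggml_allocr_free(alloc);             // buffer_owned is false, so buf survives
    ggml_backend_buffer_free(buf);
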
385
ggml-backend.c
Normal file
@@ -0,0 +1,385 @@
+#include "ggml-backend.h"
+#include "ggml-alloc.h"
+
+#include <assert.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define UNUSED GGML_UNUSED
+
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
+// backend buffer
+
+ggml_backend_buffer_t ggml_backend_buffer_init(
+        struct ggml_backend * backend,
+        struct ggml_backend_buffer_i iface,
+        ggml_backend_buffer_context_t context,
+        size_t size) {
+    ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer));
+
+    GGML_ASSERT(iface.get_base != NULL);
+
+    (*buffer) = (struct ggml_backend_buffer) {
+        /* .interface = */ iface,
+        /* .backend = */ backend,
+        /* .context = */ context,
+        /* .size = */ size,
+    };
+
+    return buffer;
+}
+
+void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
+    if (buffer->iface.free_buffer != NULL) {
+        buffer->iface.free_buffer(buffer);
+    }
+    free(buffer);
+}
+
+size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer) {
+    return ggml_backend_get_alignment(buffer->backend);
+}
+
+void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
+    return buffer->iface.get_base(buffer);
+}
+
+size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
+    return buffer->size;
+}
+
+size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+    if (buffer->iface.get_alloc_size) {
+        return buffer->iface.get_alloc_size(buffer, tensor);
+    }
+    return ggml_nbytes(tensor);
+}
+
+void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+    if (buffer->iface.init_tensor) {
+        buffer->iface.init_tensor(buffer, tensor);
+    }
+}
+
+void ggml_backend_buffer_free_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+    if (buffer->iface.free_tensor) {
+        buffer->iface.free_tensor(buffer, tensor);
+    }
+}
+
+// backend
+
+ggml_backend_t ggml_get_backend(const struct ggml_tensor * tensor) {
+    return tensor->buffer->backend;
+}
+
+const char * ggml_backend_name(ggml_backend_t backend) {
+    return backend->iface.get_name(backend);
+}
+
+void ggml_backend_free(ggml_backend_t backend) {
+    backend->iface.free(backend);
+}
+
+ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size) {
+    return backend->iface.alloc_buffer(backend, size);
+}
+
+size_t ggml_backend_get_alignment(ggml_backend_t backend) {
+    return backend->iface.get_alignment(backend);
+}
+
+void ggml_backend_tensor_set_async(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    ggml_get_backend(tensor)->iface.set_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size);
+}
+
+void ggml_backend_tensor_get_async(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    ggml_get_backend(tensor)->iface.get_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size);
+}
+
+void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    ggml_get_backend(tensor)->iface.set_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size);
+    ggml_get_backend(tensor)->iface.synchronize(ggml_get_backend(tensor));
+}
+
+void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    ggml_get_backend(tensor)->iface.get_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size);
+    ggml_get_backend(tensor)->iface.synchronize(ggml_get_backend(tensor));
+}
+
+void ggml_backend_synchronize(ggml_backend_t backend) {
+    backend->iface.synchronize(backend);
+}
+
+ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    return backend->iface.graph_plan_create(backend, cgraph);
+}
+
+void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    backend->iface.graph_plan_free(backend, plan);
+}
+
+void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    backend->iface.graph_plan_compute(backend, plan);
+}
+
+void ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    backend->iface.graph_compute(backend, cgraph);
+}
+
+bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
+    return backend->iface.supports_op(backend, op);
+}
+
+// backend copy
+
+static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
+    if (a->type != b->type) {
+        return false;
+    }
+    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+        if (a->ne[i] != b->ne[i]) {
+            return false;
+        }
+        if (a->nb[i] != b->nb[i]) {
+            return false;
+        }
+    }
+    return true;
+}
+
+void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) {
+    //printf("src: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", src->name, (int)src->ne[0], (int)src->ne[1], (int)src->ne[2], (int)src->ne[3], (int)src->nb[0], (int)src->nb[1], (int)src->nb[2], (int)src->nb[3]);
+    //printf("dst: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", dst->name, (int)dst->ne[0], (int)dst->ne[1], (int)dst->ne[2], (int)dst->ne[3], (int)dst->nb[0], (int)dst->nb[1], (int)dst->nb[2], (int)dst->nb[3]);
+    GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
+
+    // printf("cpy tensor %s from %s to %s (%lu bytes)\n", src->name, ggml_backend_name(src->backend), ggml_backend_name(dst->backend), ggml_nbytes(src));
+
+    if (src == dst) {
+        return;
+    }
+
+    // TODO: allow backends to support copy to/from same backend
+
+    if (ggml_get_backend(dst)->iface.cpy_tensor_from != NULL) {
+        ggml_get_backend(dst)->iface.cpy_tensor_from(ggml_get_backend(dst)->context, src, dst);
+    } else if (ggml_get_backend(src)->iface.cpy_tensor_to != NULL) {
+        ggml_get_backend(src)->iface.cpy_tensor_to(ggml_get_backend(src)->context, src, dst);
+    } else {
+        // shouldn't be hit when copying from/to CPU
+        #ifndef NDEBUG
+        fprintf(stderr, "ggml_backend_tensor_copy: neither cpy_tensor_from nor cpy_tensor_to are implemented for backends %s and %s, falling back to get/set\n", ggml_backend_name(src->buffer->backend), ggml_backend_name(dst->buffer->backend));
+        #endif
+        size_t nbytes = ggml_nbytes(src);
+        void * data = malloc(nbytes);
+        ggml_backend_tensor_get(src, data, 0, nbytes);
+        ggml_backend_tensor_set(dst, data, 0, nbytes);
+        free(data);
+    }
+}
+
+// backend CPU
+
+struct ggml_backend_cpu_context {
+    int n_threads;
+    void * work_data;
+    size_t work_size;
+};
+
+static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
+    return "CPU";
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_cpu_free(ggml_backend_t backend) {
+    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
+    free(cpu_ctx->work_data);
+    free(cpu_ctx);
+    free(backend);
+}
+
+static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
+    return (void *)buffer->context;
+}
+
+static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    free(buffer->context);
+    UNUSED(buffer);
+}
+
+static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
+    /* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
+    /* .get_base = */ ggml_backend_cpu_buffer_get_base,
+    /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
+    /* .init_tensor = */ NULL, // no initialization required
+    /* .free_tensor = */ NULL, // no cleanup required
+};
+
+// for buffers from ptr, free is not called
+static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
+    /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
+    /* .get_base = */ ggml_backend_cpu_buffer_get_base,
+    /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
+    /* .init_tensor = */ NULL,
+    /* .free_tensor = */ NULL,
+};
+
+static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
+
+static ggml_backend_buffer_t ggml_backend_cpu_alloc_buffer(ggml_backend_t backend, size_t size) {
+    size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
+    void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC?
+
+    return ggml_backend_buffer_init(backend, cpu_backend_buffer_i, data, size);
+}
+
+static size_t ggml_backend_cpu_get_alignment(ggml_backend_t backend) {
+    return TENSOR_ALIGNMENT;
+    UNUSED(backend);
+}
+
+static void ggml_backend_cpu_set_tensor_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+
+    memcpy((char *)tensor->data + offset, data, size);
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_cpu_get_tensor_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+
+    memcpy(data, (const char *)tensor->data + offset, size);
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_cpu_synchronize(ggml_backend_t backend) {
+    UNUSED(backend);
+}
+
+static void ggml_backend_cpu_cpy_tensor_from(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
+    ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_cpu_cpy_tensor_to(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
+    // for a backend such as CUDA that can queue async calls, it is ok to do this asynchronously, but it may not be the case for other backends
+    ggml_backend_tensor_set_async(dst, src->data, 0, ggml_nbytes(src));
+
+    UNUSED(backend);
+}
+
+struct ggml_backend_plan_cpu {
+    struct ggml_cplan cplan;
+    struct ggml_cgraph cgraph;
+};
+
+static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
+
+    struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
+
+    cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
+    cpu_plan->cgraph = *cgraph;
+
+    if (cpu_plan->cplan.work_size > 0) {
+        cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
+    }
+
+    return cpu_plan;
+}
+
+static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
+
+    free(cpu_plan->cplan.work_data);
+    free(cpu_plan);
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
+
+    ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
+
+    struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
+
+    if (cpu_ctx->work_size < cplan.work_size) {
+        // TODO: may be faster to free and use malloc to avoid the copy
+        cpu_ctx->work_data = realloc(cpu_ctx->work_data, cplan.work_size);
+        cpu_ctx->work_size = cplan.work_size;
+    }
+
+    cplan.work_data = cpu_ctx->work_data;
+
+    ggml_graph_compute(cgraph, &cplan);
+}
+
+static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
+    return true;
+    UNUSED(backend);
+    UNUSED(op);
+}
+
+static struct ggml_backend_i cpu_backend_i = {
+    /* .get_name = */ ggml_backend_cpu_name,
+    /* .free = */ ggml_backend_cpu_free,
+    /* .alloc_buffer = */ ggml_backend_cpu_alloc_buffer,
+    /* .get_alignment = */ ggml_backend_cpu_get_alignment,
+    /* .set_tensor_async = */ ggml_backend_cpu_set_tensor_async,
+    /* .get_tensor_async = */ ggml_backend_cpu_get_tensor_async,
+    /* .synchronize = */ ggml_backend_cpu_synchronize,
+    /* .cpy_tensor_from = */ ggml_backend_cpu_cpy_tensor_from,
+    /* .cpy_tensor_to = */ ggml_backend_cpu_cpy_tensor_to,
+    /* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create,
+    /* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free,
+    /* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
+    /* .graph_compute = */ ggml_backend_cpu_graph_compute,
+    /* .supports_op = */ ggml_backend_cpu_supports_op,
+};
+
+ggml_backend_t ggml_backend_cpu_init(void) {
+    struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context));
+
+    ctx->n_threads = GGML_DEFAULT_N_THREADS;
+    ctx->work_data = NULL;
+    ctx->work_size = 0;
+
+    ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend));
+
+    *cpu_backend = (struct ggml_backend) {
+        /* .interface = */ cpu_backend_i,
+        /* .context = */ ctx
+    };
+    return cpu_backend;
+}
+
+bool ggml_backend_is_cpu(ggml_backend_t backend) {
+    return backend->iface.get_name == ggml_backend_cpu_name;
+}
+
+void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
+    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
+
+    struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
+    ctx->n_threads = n_threads;
+}
+
+ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(ggml_backend_t backend_cpu, void * ptr, size_t size) {
+    return ggml_backend_buffer_init(backend_cpu, cpu_backend_buffer_i_from_ptr, ptr, size);
+}
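
A rough end-to-end sketch of the CPU backend added above; the tensor t and graph gf are placeholders built with the usual ggml context calls and are not part of this file:

    // sketch only: create the backend, place a tensor in one of its buffers, run a graph
    ggml_backend_t cpu = ggml_backend_cpu_init();
    ggml_backend_cpu_set_n_threads(cpu, 4);

    ggml_backend_buffer_t buf = ggml_backend_alloc_buffer(cpu, 16*1024*1024);
    struct ggml_allocr * alloc = ggml_allocr_new_from_buffer(buf);
    ggml_allocr_alloc(alloc, t);                 // t->data now points into buf

    float x[4] = {1, 2, 3, 4};
    ggml_backend_tensor_set(t, x, 0, sizeof(x)); // async set + synchronize
    ggml_backend_graph_compute(cpu, gf);
    ggml_backend_tensor_get(t, x, 0, sizeof(x));

    ggml_allocr_free(alloc);
    ggml_backend_buffer_free(buf);
    ggml_backend_free(cpu);
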
143
ggml-backend.h
Normal file
@@ -0,0 +1,143 @@
+#pragma once
+
+#include "ggml.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+struct ggml_backend;
+struct ggml_backend_buffer;
+
+// type-erased backend-specific types / wrappers
+typedef void * ggml_backend_context_t;
+typedef void * ggml_backend_graph_plan_t;
+typedef void * ggml_backend_buffer_context_t;
+
+// avoid accessing internals of these types
+typedef struct ggml_backend * ggml_backend_t;
+typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
+
+//
+// backend buffer
+//
+
+struct ggml_backend_buffer_i {
+    void (*free_buffer) (ggml_backend_buffer_t buffer);
+    void * (*get_base) (ggml_backend_buffer_t buffer); // get base pointer
+    size_t (*get_alloc_size)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-allocation callback
+    void (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // post-allocation callback
+    void (*free_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-free callback
+};
+
+// TODO: hide behind API
+struct ggml_backend_buffer {
+    struct ggml_backend_buffer_i iface;
+
+    ggml_backend_t backend;
+    ggml_backend_buffer_context_t context;
+
+    size_t size;
+};
+
+// backend buffer functions
+GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(
+        struct ggml_backend * backend,
+        struct ggml_backend_buffer_i iface,
+        ggml_backend_buffer_context_t context,
+        size_t size);
+
+GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
+GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
+GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer);
+GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
+GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+GGML_API void ggml_backend_buffer_free_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+
+//
+// backend
+//
+
+struct ggml_backend_i {
+    const char * (*get_name)(ggml_backend_t backend);
+
+    void (*free)(ggml_backend_t backend);
+
+    // buffer allocation
+    ggml_backend_buffer_t (*alloc_buffer)(ggml_backend_t backend, size_t size);
+
+    // get buffer alignment
+    size_t (*get_alignment)(ggml_backend_t backend);
+
+    // tensor data access
+    // these functions can be asynchronous, helper functions are provided for synchronous access that automatically call synchronize
+    void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+    void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+    void (*synchronize) (ggml_backend_t backend);
+
+    // (optional) copy tensor between different backends, allow for single-copy tranfers
+    void (*cpy_tensor_from)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
+    void (*cpy_tensor_to) (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
+
+    // compute graph with a plan
+    ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+    void (*graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+    void (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+
+    // compute graph without a plan
+    void (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+
+    // check if the backend supports an operation
+    bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
+};
+
+// TODO: hide behind API
+struct ggml_backend {
+    struct ggml_backend_i iface;
+
+    ggml_backend_context_t context;
+};
+
+// backend helper functions
+GGML_API ggml_backend_t ggml_get_backend(const struct ggml_tensor * tensor);
+
+GGML_API const char * ggml_backend_name(ggml_backend_t backend);
+GGML_API void ggml_backend_free(ggml_backend_t backend);
+
+GGML_API ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
+
+GGML_API size_t ggml_backend_get_alignment(ggml_backend_t backend);
+
+GGML_API void ggml_backend_tensor_set_async( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+GGML_API void ggml_backend_tensor_get_async(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+
+GGML_API void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+
+GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
+
+GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+
+GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+GGML_API void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+GGML_API void ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+GGML_API bool ggml_backend_supports_op (ggml_backend_t backend, const struct ggml_tensor * op);
+
+// tensor copy between different backends
+GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
+
+//
+// CPU backend
+//
+
+GGML_API ggml_backend_t ggml_backend_cpu_init(void);
+
+GGML_API bool ggml_backend_is_cpu(ggml_backend_t backend);
+
+GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);
+
+GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(ggml_backend_t backend_cpu, void * ptr, size_t size);
+
+#ifdef __cplusplus
+}
+#endif
531
ggml-cuda.cu
@@ -62,6 +62,7 @@
 #define cudaMemcpyHostToDevice hipMemcpyHostToDevice
 #define cudaMemcpyKind hipMemcpyKind
 #define cudaMemset hipMemset
+#define cudaMemsetAsync hipMemsetAsync
 #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
 #define cudaSetDevice hipSetDevice
 #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
@@ -419,6 +420,7 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
 #define CUDA_QUANTIZE_BLOCK_SIZE 256
 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
+#define CUDA_GET_ROWS_BLOCK_SIZE 256
 
 // dmmv = dequantize_mul_mat_vec
 #ifndef GGML_CUDA_DMMV_X
@@ -1574,6 +1576,34 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
     reinterpret_cast<half&>(y[ib].ds.y) = sum;
 }
 
+template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
+static __global__ void k_get_rows(const void * x, const int32_t * y, dst_t * dst, const int ncols) {
+    const int col = (blockIdx.x*blockDim.x + threadIdx.x)*2;
+    const int row = blockDim.y*blockIdx.y + threadIdx.y;
+
+    if (col >= ncols) {
+        return;
+    }
+
+    const int r = y[row];
+
+    // copy x[r*ncols + col] to dst[row*ncols + col]
+    const int xi = r*ncols + col;
+    const int di = row*ncols + col;
+
+    const int ib = xi/qk; // block index
+    const int iqs = (xi%qk)/qr; // quant index
+    const int iybs = di - di%qk; // y block start index
+    const int y_offset = qr == 1 ? 1 : qk/2;
+
+    // dequantize
+    dfloat2 v;
+    dequantize_kernel(x, ib, iqs, v);
+
+    dst[iybs + iqs + 0] = v.x;
+    dst[iybs + iqs + y_offset] = v.y;
+}
+
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
 static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
     const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
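
For reference, a scalar CPU sketch of what the k_get_rows threads compute as a whole; dequantize() stands in for the per-type dequantize_kernel and nrows/ncols/qk/qr follow the kernel's parameters (this loop is illustrative only, not code from this diff):

    // sketch only: one kernel thread handles two consecutive output elements
    for (int row = 0; row < nrows; row++) {
        const int r = y[row];                      // source row index from src1 (I32)
        for (int col = 0; col < ncols; col += 2) {
            const int xi   = r*ncols + col;        // element index in the source row
            const int ib   = xi/qk;                // quant block
            const int iqs  = (xi%qk)/qr;           // position inside the block
            const int di   = row*ncols + col;
            const int iybs = di - di%qk;           // output block start
            const int y_offset = qr == 1 ? 1 : qk/2;
            dfloat2 v;
            dequantize(x, ib, iqs, v);             // yields two values per call
            dst[iybs + iqs + 0]        = v.x;
            dst[iybs + iqs + y_offset] = v.y;
        }
    }
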
@@ -4555,6 +4585,15 @@ static __global__ void scale_f32(const float * x, float * dst, const float scale
     dst[i] = scale * x[i];
 }
 
+
+template<int qk, int qr, dequantize_kernel_t dq>
+static void get_rows_cuda(const void * x, const int32_t * y, float * dst, const int nrows, const int ncols, cudaStream_t stream) {
+    const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
+    const int block_num_x = (ncols + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
+    const dim3 block_nums(block_num_x, nrows, 1);
+    k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(x, y, dst, ncols);
+}
+
 static void add_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
     const int num_blocks = (kx + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
     add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
@@ -5703,7 +5742,7 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
     } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) {
         GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
         kind = cudaMemcpyDeviceToDevice;
-        struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
+        ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
         int id;
         CUDA_CHECK(cudaGetDevice(&id));
         src_ptr = (char *) extra->data_device[id];
@@ -5739,6 +5778,107 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
     }
 }
 
+static void ggml_cuda_op_repeat(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & stream) {
+    // guaranteed to be an integer due to the check in ggml_can_repeat
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];
+    const int64_t ne2 = dst->ne[2];
+    const int64_t ne3 = dst->ne[3];
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
+
+    const size_t nb0 = dst->nb[0];
+    const size_t nb1 = dst->nb[1];
+    const size_t nb2 = dst->nb[2];
+    const size_t nb3 = dst->nb[3];
+
+    const size_t nb00 = src0->nb[0];
+    const size_t nb01 = src0->nb[1];
+    const size_t nb02 = src0->nb[2];
+    const size_t nb03 = src0->nb[3];
+
+    const int nr0 = (int)(ne0/ne00);
+    const int nr1 = (int)(ne1/ne01);
+    const int nr2 = (int)(ne2/ne02);
+    const int nr3 = (int)(ne3/ne03);
+
+    // TODO: support for transposed / permuted tensors
+    GGML_ASSERT(nb0 == sizeof(float));
+    GGML_ASSERT(nb00 == sizeof(float));
+
+    // TODO: very inefficient, implement in a kernel, or fewer cudaMemcpyAsync calls for contiguous tensors
+    for (int i3 = 0; i3 < nr3; i3++) {
+        for (int k3 = 0; k3 < ne03; k3++) {
+            for (int i2 = 0; i2 < nr2; i2++) {
+                for (int k2 = 0; k2 < ne02; k2++) {
+                    for (int i1 = 0; i1 < nr1; i1++) {
+                        for (int k1 = 0; k1 < ne01; k1++) {
+                            for (int i0 = 0; i0 < nr0; i0++) {
+                                CUDA_CHECK(cudaMemcpyAsync(
+                                    (char *) dst_d + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0,
+                                    (const char *) src0_d + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01,
+                                    ne00*nb0, cudaMemcpyDeviceToDevice, stream));
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    (void) src1;
+    (void) src1_d;
+}
+
+static void ggml_cuda_op_get_rows(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & stream) {
+
+    GGML_ASSERT(src1->type == GGML_TYPE_I32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    GGML_ASSERT(ggml_is_contiguous(src1));
+    GGML_ASSERT(ggml_is_contiguous(dst));
+
+    const int ncols = src0->ne[0];
+    const int nrows = ggml_nelements(src1);
+
+    const int32_t * src1_i32 = (const int32_t *) src1_d;
+
+    switch (src0->type) {
+        case GGML_TYPE_F16:
+            get_rows_cuda<1, 1, convert_f16>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+            break;
+        case GGML_TYPE_F32:
+            get_rows_cuda<1, 1, convert_f32>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+            break;
+        case GGML_TYPE_Q4_0:
+            get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+            break;
+        case GGML_TYPE_Q4_1:
+            get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+            break;
+        case GGML_TYPE_Q5_0:
+            get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+            break;
+        case GGML_TYPE_Q5_1:
+            get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+            break;
+        case GGML_TYPE_Q8_0:
+            get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+            break;
+        default:
+            // TODO: k-quants
+            GGML_ASSERT(false);
+            break;
+    }
+}
+
 inline void ggml_cuda_op_add(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -6343,7 +6483,14 @@ inline void ggml_cuda_op_scale(
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
-    const float scale = ((float *) src1->data)[0];
+    float scale;
+    // HACK: support for ggml backend interface
+    if (src1->backend == GGML_BACKEND_CPU) {
+        scale = ((float *) src1->data)[0];
+    } else {
+        // TODO: pass pointer to kernel instead of copying to host
+        CUDA_CHECK(cudaMemcpy(&scale, src1->data, sizeof(float), cudaMemcpyDeviceToHost));
+    }
 
     scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
     CUDA_CHECK(cudaGetLastError());
@@ -6362,9 +6509,9 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
     GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
     GGML_ASSERT( dst->backend != GGML_BACKEND_GPU_SPLIT);
 
-    struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
-    struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
+    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
 
     const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
     const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
@@ -6505,9 +6652,9 @@ static void ggml_cuda_op_mul_mat(
     const size_t q8_1_ts = sizeof(block_q8_1);
     const size_t q8_1_bs = QK8_1;
 
-    struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-    struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
 
     const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
     const bool src0_is_contiguous = ggml_is_contiguous(src0);
@@ -6585,7 +6732,7 @@ static void ggml_cuda_op_mul_mat(
         if (convert_src1_to_q8_1) {
             src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]);
 
-            if (split && src1_on_device && src1_is_contiguous) {
+            if (src1_on_device && src1_is_contiguous) {
                 quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream);
                 CUDA_CHECK(cudaGetLastError());
             }
@@ -6667,7 +6814,7 @@ static void ggml_cuda_op_mul_mat(
                     GGML_ASSERT(false);
                 }
 
-                if (convert_src1_to_q8_1 && src1->backend == GGML_BACKEND_CPU) {
+                if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_CPU || !src1_is_contiguous)) {
                     quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
                     CUDA_CHECK(cudaGetLastError());
                 }
@@ -6758,6 +6905,14 @@ static void ggml_cuda_op_mul_mat(
         }
     }
 }
+
+static void ggml_cuda_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_repeat);
+}
+
+static void ggml_cuda_get_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_get_rows);
+}
 
 static void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
 }
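
The two new wrappers follow the same pattern as the existing single-tensor ops: an op function with the (src0, src1, dst, src0_d, src1_d, dst_d, stream) signature is adapted through ggml_cuda_op_flatten. A hypothetical additional op would be wired identically (ggml_cuda_op_foo below is illustrative and does not exist in this diff):

    // sketch only: wiring a new op through ggml_cuda_op_flatten
    static void ggml_cuda_foo(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
        ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_foo);
    }
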
@@ -6812,13 +6967,13 @@ static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tens
     CUDA_CHECK(ggml_cuda_set_device(g_main_device));
     cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
 
-    struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     void * src0_ddq = src0_extra->data_device[g_main_device];
 
-    struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
     float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
 
-    struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
     float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
 
     ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
@@ -6843,13 +6998,13 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];

-   struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+   ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
    void * src0_ddq = src0_extra->data_device[g_main_device];

-   struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+   ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];

-   struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+   ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];

    const int64_t row_stride_x = nb01 / sizeof(half);
@@ -6870,7 +7025,7 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
        }
    }

-   if (all_on_device && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
+   if (all_on_device && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
        ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
    } else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
        ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
@@ -6935,8 +7090,8 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];

-   const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+   const ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-   const struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+   const ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;

    char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
    char * src1_ddc = (char *) src1_extra->data_device[g_main_device];
@@ -6991,8 +7146,8 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
    const size_t nb1 = tensor->nb[1];

-   ggml_backend backend = tensor->backend;
+   ggml_backend_type backend = tensor->backend;
-   struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
+   ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
    memset(extra, 0, sizeof(*extra));

    for (int64_t id = 0; id < g_device_count; ++id) {
@@ -7046,7 +7201,6 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
            CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
        }
-
        CUDA_CHECK(cudaMemcpy(buf, buf_host, original_size, cudaMemcpyHostToDevice));

        extra->data_device[id] = buf;
@@ -7085,17 +7239,17 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
    delete extra;
}

-static struct ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
+static ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
static size_t g_temp_tensor_extra_index = 0;

-static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
+static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
    if (g_temp_tensor_extras == nullptr) {
        g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
    }

    size_t alloc_index = g_temp_tensor_extra_index;
    g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_MAX_NODES;
-   struct ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
+   ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
    memset(extra, 0, sizeof(*extra));

    return extra;
@@ -7123,7 +7277,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scra
        return;
    }

-   struct ggml_tensor_extra_gpu * extra;
+   ggml_tensor_extra_gpu * extra;

    const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
        tensor->op == GGML_OP_VIEW ||
@@ -7132,7 +7286,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scra
    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
    if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
-       struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
+       ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
        char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
        size_t offset = 0;
        if (tensor->op == GGML_OP_VIEW) {
@@ -7141,7 +7295,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scra
        extra = ggml_cuda_alloc_temp_tensor_extra();
        extra->data_device[g_main_device] = src0_ddc + offset;
    } else if (tensor->op == GGML_OP_CPY) {
-       struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
+       ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
        void * src1_ddv = src1_extra->data_device[g_main_device];
        extra = ggml_cuda_alloc_temp_tensor_extra();
        extra->data_device[g_main_device] = src1_ddv;
@@ -7183,13 +7337,13 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset)
        CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
    }

-   struct ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
+   ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();

    const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
        tensor->op == GGML_OP_VIEW;

    if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
-       struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
+       ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
        char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
        size_t view_offset = 0;
        if (tensor->op == GGML_OP_VIEW) {
@@ -7207,7 +7361,7 @@ void ggml_cuda_copy_to_device(struct ggml_tensor * tensor) {
    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
    GGML_ASSERT(ggml_is_contiguous(tensor));

-   struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
+   ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
    CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice));
}
@@ -7270,52 +7424,41 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
        || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
        || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);

-   switch (tensor->op) {
-       case GGML_OP_DUP:
-           if (!any_on_device) {
-               return false;
-           }
+   if (!any_on_device && tensor->op != GGML_OP_MUL_MAT) {
+       return false;
+   }

+   switch (tensor->op) {
+       case GGML_OP_REPEAT:
+           func = ggml_cuda_repeat;
+           break;
+       case GGML_OP_GET_ROWS:
+           func = ggml_cuda_get_rows;
+           break;
+       case GGML_OP_DUP:
            func = ggml_cuda_dup;
            break;
        case GGML_OP_ADD:
-           if (!any_on_device) {
-               return false;
-           }
            func = ggml_cuda_add;
            break;
        case GGML_OP_MUL:
-           if (!any_on_device) {
-               return false;
-           }
            func = ggml_cuda_mul;
            break;
        case GGML_OP_UNARY:
            switch (ggml_get_unary_op(tensor)) {
                case GGML_UNARY_OP_GELU:
-                   if (!any_on_device) {
-                       return false;
-                   }
                    func = ggml_cuda_gelu;
                    break;
                case GGML_UNARY_OP_SILU:
-                   if (!any_on_device) {
-                       return false;
-                   }
                    func = ggml_cuda_silu;
                    break;
                default:
                    return false;
            } break;
        case GGML_OP_NORM:
-           if (!any_on_device) {
-               return false;
-           }
            func = ggml_cuda_norm;
            break;
        case GGML_OP_RMS_NORM:
-           if (!any_on_device) {
-               return false;
-           }
            func = ggml_cuda_rms_norm;
            break;
        case GGML_OP_MUL_MAT:
@@ -7325,54 +7468,30 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
            func = ggml_cuda_mul_mat;
            break;
        case GGML_OP_SCALE:
-           if (!any_on_device) {
-               return false;
-           }
            func = ggml_cuda_scale;
            break;
        case GGML_OP_CPY:
-           if (!any_on_device) {
-               return false;
-           }
            func = ggml_cuda_cpy;
            break;
        case GGML_OP_CONT:
-           if (!any_on_device) {
-               return false;
-           }
            func = ggml_cuda_dup;
            break;
        case GGML_OP_RESHAPE:
        case GGML_OP_VIEW:
        case GGML_OP_PERMUTE:
        case GGML_OP_TRANSPOSE:
-           if (!any_on_device) {
-               return false;
-           }
            func = ggml_cuda_nop;
            break;
        case GGML_OP_DIAG_MASK_INF:
-           if (!any_on_device) {
-               return false;
-           }
            func = ggml_cuda_diag_mask_inf;
            break;
        case GGML_OP_SOFT_MAX:
-           if (!any_on_device) {
-               return false;
-           }
            func = ggml_cuda_soft_max;
            break;
        case GGML_OP_ROPE:
-           if (!any_on_device) {
-               return false;
-           }
            func = ggml_cuda_rope;
            break;
        case GGML_OP_ALIBI:
-           if (!any_on_device) {
-               return false;
-           }
            func = ggml_cuda_alibi;
            break;
        default:
@@ -7400,3 +7519,263 @@ void ggml_cuda_get_device_description(int device, char * description, size_t des
    CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
    snprintf(description, description_size, "%s", prop.name);
}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// backend interface
+
+#define UNUSED GGML_UNUSED
+
+struct ggml_backend_context_cuda {
+};
+
+static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
+    return GGML_CUDA_NAME;
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_cuda_free(ggml_backend_t backend) {
+    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+    delete cuda_ctx;
+    delete backend;
+}
+
+struct ggml_backend_buffer_context_cuda {
+    void * device;
+
+    ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
+    size_t temp_tensor_extra_index = 0;
+
+    ~ggml_backend_buffer_context_cuda() {
+        delete[] temp_tensor_extras;
+    }
+
+    ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
+        if (temp_tensor_extras == nullptr) {
+            temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
+        }
+
+        size_t alloc_index = temp_tensor_extra_index;
+        temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_MAX_NODES;
+        ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index];
+        memset(extra, 0, sizeof(*extra));
+
+        return extra;
+    }
+};
+
+static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+    CUDA_CHECK(cudaFree(ctx->device));
+    delete ctx;
+}
+
+static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
+    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+    return ctx->device;
+}
+
+static size_t ggml_backend_cuda_buffer_get_alloc_size(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
+    int64_t row_low = 0;
+    int64_t row_high = ggml_nrows(tensor);
+    int64_t nrows_split = row_high - row_low;
+
+    size_t size = ggml_nbytes_split(tensor, nrows_split);
+
+    int64_t ne0 = tensor->ne[0];
+
+    if (ggml_is_quantized(tensor->type)) {
+        if (ne0 % MATRIX_ROW_PADDING != 0) {
+            size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
+                * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
+        }
+    }
+
+    return size;
+
+    UNUSED(buffer);
+}
+
+static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
+    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+
+    if (tensor->view_src != NULL && tensor->view_offs == 0) {
+        assert(tensor->view_src->buffer->backend == buffer->backend);
+        tensor->backend = tensor->view_src->backend;
+        tensor->extra = tensor->view_src->extra;
+        return;
+    }
+
+    ggml_tensor_extra_gpu * extra = ctx->ggml_cuda_alloc_temp_tensor_extra();
+
+    extra->data_device[g_main_device] = tensor->data;
+
+    tensor->backend = GGML_BACKEND_GPU;
+    tensor->extra = extra;
+
+    if (ggml_is_quantized(tensor->type)) {
+        // initialize padding to 0 to avoid possible NaN values
+        int64_t row_low = 0;
+        int64_t row_high = ggml_nrows(tensor);
+        int64_t nrows_split = row_high - row_low;
+
+        size_t original_size = ggml_nbytes_split(tensor, nrows_split);
+        size_t padded_size = ggml_backend_cuda_buffer_get_alloc_size(tensor->buffer, tensor);
+
+        if (padded_size > original_size && tensor->view_src == nullptr) {
+            CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[g_main_device][0]));
+        }
+    }
+
+    UNUSED(buffer);
+}
+
+static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
+    /* .free_buffer    = */ ggml_backend_cuda_buffer_free_buffer,
+    /* .get_base       = */ ggml_backend_cuda_buffer_get_base,
+    /* .get_alloc_size = */ ggml_backend_cuda_buffer_get_alloc_size,
+    /* .init_tensor    = */ ggml_backend_cuda_buffer_init_tensor,
+    /* .free_tensor    = */ NULL,
+};
+
+static ggml_backend_buffer_t ggml_backend_cuda_alloc_buffer(ggml_backend_t backend, size_t size) {
+    ggml_cuda_set_device(g_main_device);
+
+    ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda;
+    CUDA_CHECK(cudaMalloc(&ctx->device, size));
+    return ggml_backend_buffer_init(backend, cuda_backend_buffer_interface, ctx, size);
+}
+
+static size_t ggml_backend_cuda_get_alignment(ggml_backend_t backend) {
+    return 128;
+    UNUSED(backend);
+}
+
+static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+
+    CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[g_main_device][0]));
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+
+    CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
+    CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
+
+    UNUSED(backend);
+}
+
+static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, ggml_cgraph * cgraph) {
+    GGML_ASSERT(!"not implemented");
+
+    return nullptr;
+
+    UNUSED(backend);
+    UNUSED(cgraph);
+}
+
+static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    GGML_ASSERT(!"not implemented");
+
+    UNUSED(backend);
+    UNUSED(plan);
+}
+
+static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    GGML_ASSERT(!"not implemented");
+
+    UNUSED(backend);
+    UNUSED(plan);
+}
+
+static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+    ggml_cuda_set_device(g_main_device);
+
+    ggml_compute_params params = {};
+    params.type = GGML_TASK_COMPUTE;
+    params.ith = 0;
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        ggml_tensor * node = cgraph->nodes[i];
+
+        assert(node->backend == GGML_BACKEND_GPU);
+        for (int j = 0; j < GGML_MAX_SRC; j++) {
+            if (node->src[j] != nullptr) {
+                assert(node->src[j]->backend == GGML_BACKEND_GPU);
+            }
+        }
+
+        bool ok = ggml_cuda_compute_forward(&params, node);
+        if (!ok) {
+            fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
+        }
+        GGML_ASSERT(ok);
+
+#if 0
+        if (node->type == GGML_TYPE_F32) {
+            cudaDeviceSynchronize();
+            std::vector<float> tmp(ggml_nelements(node), 0.0f);
+            cudaMemcpy(tmp.data(), node->data, ggml_nelements(node)*sizeof(float), cudaMemcpyDeviceToHost);
+            printf("\n%s (%s) (%s %s) (%s %s): ", node->name, ggml_op_name(node->op),
+                ggml_type_name(node->src[0]->type),
+                node->src[1] ? ggml_type_name(node->src[1]->type) : "none",
+                node->src[0]->name,
+                node->src[1] ? node->src[1]->name : "none");
+            double sum = 0.0;
+            double sq_sum = 0.0;
+            for (int i = 0; i < ggml_nelements(node); i++) {
+                printf("%f ", tmp[i]);
+                sum += tmp[i];
+                sq_sum += tmp[i]*tmp[i];
+            }
+            printf("\n");
+            printf("sum: %f, ", sum);
+            printf("sq_sum: %f\n", sq_sum);
+        }
+#endif
+    }
+
+    UNUSED(backend);
+}
+
+static ggml_backend_i cuda_backend_i = {
+    /* .get_name            = */ ggml_backend_cuda_name,
+    /* .free                = */ ggml_backend_cuda_free,
+    /* .alloc_buffer        = */ ggml_backend_cuda_alloc_buffer,
+    /* .get_alignment       = */ ggml_backend_cuda_get_alignment,
+    /* .set_tensor_async    = */ ggml_backend_cuda_set_tensor_async,
+    /* .get_tensor_async    = */ ggml_backend_cuda_get_tensor_async,
+    /* .synchronize         = */ ggml_backend_cuda_synchronize,
+    /* .cpy_tensor_from     = */ nullptr,
+    /* .cpy_tensor_to       = */ nullptr,
+    /* .graph_plan_create   = */ ggml_backend_cuda_graph_plan_create,
+    /* .graph_plan_free     = */ ggml_backend_cuda_graph_plan_free,
+    /* .graph_plan_compute  = */ ggml_backend_cuda_graph_plan_compute,
+    /* .graph_compute       = */ ggml_backend_cuda_graph_compute,
+    /* .supports_op         = */ nullptr,
+};
+
+ggml_backend_t ggml_backend_cuda_init() {
+    ggml_init_cublas(); // TODO: remove from ggml.c
+
+    ggml_backend_context_cuda * ctx = new ggml_backend_context_cuda;
+
+    ggml_backend_t cuda_backend = new ggml_backend {
+        /* .interface = */ cuda_backend_i,
+        /* .context   = */ ctx
+    };
+
+    return cuda_backend;
+}
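Note: the padding rule in ggml_backend_cuda_buffer_get_alloc_size above rounds the allocation of quantized tensors up so that kernels may read a full MATRIX_ROW_PADDING-sized row past the logical end. A self-contained sketch of the same rule, using made-up block constants rather than the real ggml type parameters:

#include <cstddef>
#include <cstdio>

// made-up constants: 32-value blocks stored in 18 bytes, rows padded to 512 elements
constexpr size_t kBlockSize  = 32;
constexpr size_t kTypeSize   = 18;
constexpr size_t kRowPadding = 512;

size_t padded_alloc_size(size_t ne0, size_t nrows) {
    size_t size = nrows * (ne0 / kBlockSize) * kTypeSize; // tightly packed rows
    if (ne0 % kRowPadding != 0) {
        // grow the allocation so the last row can be read in padded chunks
        size += (kRowPadding - ne0 % kRowPadding) * kTypeSize / kBlockSize;
    }
    return size;
}

int main() {
    std::printf("%zu\n", padded_alloc_size(4096, 2)); // 4096 is a multiple of 512: no padding
    std::printf("%zu\n", padded_alloc_size(4000, 2)); // padded by (512-416)*18/32 = 54 bytes
}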
@@ -1,6 +1,7 @@
#pragma once

#include "ggml.h"
+#include "ggml-backend.h"

#ifdef GGML_USE_HIPBLAS
#define GGML_CUDA_NAME "ROCm"
@@ -42,6 +43,9 @@ GGML_API bool ggml_cuda_compute_forward(struct ggml_compute_params * params, s
GGML_API int  ggml_cuda_get_device_count(void);
GGML_API void ggml_cuda_get_device_description(int device, char * description, size_t description_size);

+// backend API
+GGML_API ggml_backend_t ggml_backend_cuda_init(void); // TODO: take a list of devices to use
+
#ifdef __cplusplus
}
#endif
19  ggml-metal.h
@@ -20,6 +20,7 @@
#pragma once

#include "ggml.h"
+#include "ggml-backend.h"

#include <stddef.h>
#include <stdbool.h>
@@ -35,10 +36,15 @@ struct ggml_cgraph;
extern "C" {
#endif

-void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
+//
+// internal API
+// temporary exposed to user-code
+//

struct ggml_metal_context;

+void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
+
// number of command buffers to use
struct ggml_metal_context * ggml_metal_init(int n_cb);
void ggml_metal_free(struct ggml_metal_context * ctx);
@@ -83,6 +89,17 @@ int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx);
// creates gf->n_threads command buffers in parallel
void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);

+//
+// backend API
+// user-code should use only these functions
+//
+
+GGML_API ggml_backend_t ggml_backend_metal_init(void);
+
+GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
+
+GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
+
#ifdef __cplusplus
}
#endif
407  ggml-metal.m
@@ -81,18 +81,18 @@ struct ggml_metal_context {
    GGML_METAL_DECL_KERNEL(get_rows_q6_K);
    GGML_METAL_DECL_KERNEL(rms_norm);
    GGML_METAL_DECL_KERNEL(norm);
-   GGML_METAL_DECL_KERNEL(mul_mat_f32_f32);
+   GGML_METAL_DECL_KERNEL(mul_mv_f32_f32);
-   GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
+   GGML_METAL_DECL_KERNEL(mul_mv_f16_f32);
-   GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_1row);
+   GGML_METAL_DECL_KERNEL(mul_mv_f16_f32_1row);
-   GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_l4);
+   GGML_METAL_DECL_KERNEL(mul_mv_f16_f32_l4);
-   GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
+   GGML_METAL_DECL_KERNEL(mul_mv_q4_0_f32);
-   GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32);
+   GGML_METAL_DECL_KERNEL(mul_mv_q4_1_f32);
-   GGML_METAL_DECL_KERNEL(mul_mat_q8_0_f32);
+   GGML_METAL_DECL_KERNEL(mul_mv_q8_0_f32);
-   GGML_METAL_DECL_KERNEL(mul_mat_q2_K_f32);
+   GGML_METAL_DECL_KERNEL(mul_mv_q2_K_f32);
-   GGML_METAL_DECL_KERNEL(mul_mat_q3_K_f32);
+   GGML_METAL_DECL_KERNEL(mul_mv_q3_K_f32);
-   GGML_METAL_DECL_KERNEL(mul_mat_q4_K_f32);
+   GGML_METAL_DECL_KERNEL(mul_mv_q4_K_f32);
-   GGML_METAL_DECL_KERNEL(mul_mat_q5_K_f32);
+   GGML_METAL_DECL_KERNEL(mul_mv_q5_K_f32);
-   GGML_METAL_DECL_KERNEL(mul_mat_q6_K_f32);
+   GGML_METAL_DECL_KERNEL(mul_mv_q6_K_f32);
    GGML_METAL_DECL_KERNEL(mul_mm_f32_f32);
    GGML_METAL_DECL_KERNEL(mul_mm_f16_f32);
    GGML_METAL_DECL_KERNEL(mul_mm_q4_0_f32);
@@ -109,6 +109,8 @@ struct ggml_metal_context {
    GGML_METAL_DECL_KERNEL(cpy_f32_f16);
    GGML_METAL_DECL_KERNEL(cpy_f32_f32);
    GGML_METAL_DECL_KERNEL(cpy_f16_f16);
+   GGML_METAL_DECL_KERNEL(concat);
+   GGML_METAL_DECL_KERNEL(sqr);

#undef GGML_METAL_DECL_KERNEL
};
@@ -183,56 +185,44 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {

    ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);

-#ifdef GGML_SWIFT
-   // load the default.metallib file
+   // load library
    {
+       NSBundle * bundle = nil;
+#ifdef SWIFT_PACKAGE
+       bundle = SWIFTPM_MODULE_BUNDLE;
+#else
+       bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
+#endif
        NSError * error = nil;
+       NSString * libPath = [bundle pathForResource:@"default" ofType:@"metallib"];
-       NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
+       if (libPath != nil) {
-       NSString * llamaBundlePath = [bundle pathForResource:@"llama_llama" ofType:@"bundle"];
-       NSBundle * llamaBundle = [NSBundle bundleWithPath:llamaBundlePath];
-       NSString * libPath = [llamaBundle pathForResource:@"default" ofType:@"metallib"];
            NSURL * libURL = [NSURL fileURLWithPath:libPath];
+           GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [libPath UTF8String]);
-       // Load the metallib file into a Metal library
            ctx->library = [ctx->device newLibraryWithURL:libURL error:&error];
+       } else {
+           GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
+
-       if (error) {
+           NSString * sourcePath = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
-           GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
+           GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [sourcePath UTF8String]);
-           return NULL;
+           NSString * src = [NSString stringWithContentsOfFile:sourcePath encoding:NSUTF8StringEncoding error:&error];
-       }
-   }
-#else
-   UNUSED(msl_library_source);
-
-   // read the source from "ggml-metal.metal" into a string and use newLibraryWithSource
-   {
-       NSError * error = nil;
-
-       //NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/metal/metal" ofType:@"metal"];
-       NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
-       NSString * path = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
-       GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [path UTF8String]);
-
-       NSString * src = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error];
            if (error) {
                GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
                return NULL;
            }

+           MTLCompileOptions* options = nil;
#ifdef GGML_QKK_64
-           MTLCompileOptions* options = [MTLCompileOptions new];
+           options = [MTLCompileOptions new];
            options.preprocessorMacros = @{ @"QK_K" : @(64) };
-           ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error];
-#else
-           ctx->library = [ctx->device newLibraryWithSource:src options:nil error:&error];
#endif
+           ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error];
+       }

        if (error) {
            GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
            return NULL;
        }
    }
-#endif

    // load kernels
    {
@@ -272,18 +262,19 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
        GGML_METAL_ADD_KERNEL(get_rows_q6_K);
        GGML_METAL_ADD_KERNEL(rms_norm);
        GGML_METAL_ADD_KERNEL(norm);
-       GGML_METAL_ADD_KERNEL(mul_mat_f32_f32);
+       GGML_METAL_ADD_KERNEL(mul_mv_f32_f32);
-       GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
+       GGML_METAL_ADD_KERNEL(mul_mv_f16_f32);
-       GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_1row);
+       GGML_METAL_ADD_KERNEL(mul_mv_f16_f32_1row);
-       GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_l4);
+       GGML_METAL_ADD_KERNEL(mul_mv_f16_f32_l4);
-       GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
+       GGML_METAL_ADD_KERNEL(mul_mv_q4_0_f32);
-       GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32);
+       GGML_METAL_ADD_KERNEL(mul_mv_q4_1_f32);
-       GGML_METAL_ADD_KERNEL(mul_mat_q8_0_f32);
+       GGML_METAL_ADD_KERNEL(mul_mv_q8_0_f32);
-       GGML_METAL_ADD_KERNEL(mul_mat_q2_K_f32);
+       GGML_METAL_ADD_KERNEL(mul_mv_q2_K_f32);
-       GGML_METAL_ADD_KERNEL(mul_mat_q3_K_f32);
+       GGML_METAL_ADD_KERNEL(mul_mv_q3_K_f32);
-       GGML_METAL_ADD_KERNEL(mul_mat_q4_K_f32);
+       GGML_METAL_ADD_KERNEL(mul_mv_q4_K_f32);
-       GGML_METAL_ADD_KERNEL(mul_mat_q5_K_f32);
+       GGML_METAL_ADD_KERNEL(mul_mv_q5_K_f32);
-       GGML_METAL_ADD_KERNEL(mul_mat_q6_K_f32);
+       GGML_METAL_ADD_KERNEL(mul_mv_q6_K_f32);
+       if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) {
        GGML_METAL_ADD_KERNEL(mul_mm_f32_f32);
        GGML_METAL_ADD_KERNEL(mul_mm_f16_f32);
        GGML_METAL_ADD_KERNEL(mul_mm_q4_0_f32);
@@ -294,18 +285,34 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
        GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32);
        GGML_METAL_ADD_KERNEL(mul_mm_q5_K_f32);
        GGML_METAL_ADD_KERNEL(mul_mm_q6_K_f32);
+       }
        GGML_METAL_ADD_KERNEL(rope_f32);
        GGML_METAL_ADD_KERNEL(rope_f16);
        GGML_METAL_ADD_KERNEL(alibi_f32);
        GGML_METAL_ADD_KERNEL(cpy_f32_f16);
        GGML_METAL_ADD_KERNEL(cpy_f32_f32);
        GGML_METAL_ADD_KERNEL(cpy_f16_f16);
+       GGML_METAL_ADD_KERNEL(concat);
+       GGML_METAL_ADD_KERNEL(sqr);

#undef GGML_METAL_ADD_KERNEL
    }

-   GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
#if TARGET_OS_OSX
+   // print MTL GPU family:
+   GGML_METAL_LOG_INFO("%s: GPU name: %s\n", __func__, [[ctx->device name] UTF8String]);
+
+   // determine max supported GPU family
+   // https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf
+   // https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
+   for (int i = MTLGPUFamilyApple1 + 20; i >= MTLGPUFamilyApple1; --i) {
+       if ([ctx->device supportsFamily:i]) {
+           GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyApple%d (%d)\n", __func__, i - MTLGPUFamilyApple1 + 1, i);
+           break;
+       }
+   }
+
+   GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
    GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
    if (ctx->device.maxTransferRate != 0) {
        GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
@@ -347,18 +354,19 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
    GGML_METAL_DEL_KERNEL(get_rows_q6_K);
    GGML_METAL_DEL_KERNEL(rms_norm);
    GGML_METAL_DEL_KERNEL(norm);
-   GGML_METAL_DEL_KERNEL(mul_mat_f32_f32);
+   GGML_METAL_DEL_KERNEL(mul_mv_f32_f32);
-   GGML_METAL_DEL_KERNEL(mul_mat_f16_f32);
+   GGML_METAL_DEL_KERNEL(mul_mv_f16_f32);
-   GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_1row);
+   GGML_METAL_DEL_KERNEL(mul_mv_f16_f32_1row);
-   GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_l4);
+   GGML_METAL_DEL_KERNEL(mul_mv_f16_f32_l4);
-   GGML_METAL_DEL_KERNEL(mul_mat_q4_0_f32);
+   GGML_METAL_DEL_KERNEL(mul_mv_q4_0_f32);
-   GGML_METAL_DEL_KERNEL(mul_mat_q4_1_f32);
+   GGML_METAL_DEL_KERNEL(mul_mv_q4_1_f32);
-   GGML_METAL_DEL_KERNEL(mul_mat_q8_0_f32);
+   GGML_METAL_DEL_KERNEL(mul_mv_q8_0_f32);
-   GGML_METAL_DEL_KERNEL(mul_mat_q2_K_f32);
+   GGML_METAL_DEL_KERNEL(mul_mv_q2_K_f32);
-   GGML_METAL_DEL_KERNEL(mul_mat_q3_K_f32);
+   GGML_METAL_DEL_KERNEL(mul_mv_q3_K_f32);
-   GGML_METAL_DEL_KERNEL(mul_mat_q4_K_f32);
+   GGML_METAL_DEL_KERNEL(mul_mv_q4_K_f32);
-   GGML_METAL_DEL_KERNEL(mul_mat_q5_K_f32);
+   GGML_METAL_DEL_KERNEL(mul_mv_q5_K_f32);
-   GGML_METAL_DEL_KERNEL(mul_mat_q6_K_f32);
+   GGML_METAL_DEL_KERNEL(mul_mv_q6_K_f32);
+   if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) {
    GGML_METAL_DEL_KERNEL(mul_mm_f32_f32);
    GGML_METAL_DEL_KERNEL(mul_mm_f16_f32);
    GGML_METAL_DEL_KERNEL(mul_mm_q4_0_f32);
@@ -369,12 +377,15 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
    GGML_METAL_DEL_KERNEL(mul_mm_q4_K_f32);
    GGML_METAL_DEL_KERNEL(mul_mm_q5_K_f32);
    GGML_METAL_DEL_KERNEL(mul_mm_q6_K_f32);
+   }
    GGML_METAL_DEL_KERNEL(rope_f32);
    GGML_METAL_DEL_KERNEL(rope_f16);
    GGML_METAL_DEL_KERNEL(alibi_f32);
    GGML_METAL_DEL_KERNEL(cpy_f32_f16);
    GGML_METAL_DEL_KERNEL(cpy_f32_f32);
    GGML_METAL_DEL_KERNEL(cpy_f16_f16);
+   GGML_METAL_DEL_KERNEL(concat);
+   GGML_METAL_DEL_KERNEL(sqr);

#undef GGML_METAL_DEL_KERNEL
@@ -431,7 +442,7 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, stru
    for (int i = 0; i < ctx->n_buffers; ++i) {
        const int64_t ioffs = (int64_t) t->data - (int64_t) ctx->buffers[i].data;

-       //metal_printf("ioffs = %10ld, tsize = %10ld, sum = %10ld, ctx->buffers[%d].size = %10ld, name = %s\n", ioffs, tsize, ioffs + tsize, i, ctx->buffers[i].size, ctx->buffers[i].name);
+       //GGML_METAL_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, ctx->buffers[%d].size = %10ld, name = %s\n", ioffs, tsize, ioffs + tsize, i, ctx->buffers[i].size, ctx->buffers[i].name);
        if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) {
            *offs = (size_t) ioffs;
@@ -766,6 +777,43 @@ void ggml_metal_graph_compute(
                {
                    // noop
                } break;
+           case GGML_OP_CONCAT:
+               {
+                   int64_t nb = ne00;
+                   [encoder setComputePipelineState:ctx->pipeline_concat];
+                   [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                   [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
+                   [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
+                   [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
+                   [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
+                   [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
+                   [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6];
+                   [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7];
+                   [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:8];
+                   [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:9];
+                   [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:10];
+                   [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11];
+                   [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12];
+                   [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13];
+                   [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14];
+                   [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15];
+                   [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16];
+                   [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17];
+                   [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18];
+                   [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:19];
+                   [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:20];
+                   [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:21];
+                   [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:22];
+                   [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:23];
+                   [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:24];
+                   [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:25];
+                   [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:26];
+                   [encoder setBytes:&nb length:sizeof(nb) atIndex:27];
+
+                   const int nth = MIN(1024, ne0);
+                   [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+               } break;
            case GGML_OP_ADD:
                {
                    GGML_ASSERT(ggml_is_contiguous(src0));
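Note: for reference, a CPU sketch of what a 4-D concat like the new GGML_OP_CONCAT case computes. Concatenating along the third dimension is an assumption for illustration; the kernel above receives all shapes and strides, so the exact axis is not visible from this hunk alone.

#include <vector>
#include <cstddef>

// reference concat of two row-major 4-D tensors along dim 2
std::vector<float> concat_dim2(const std::vector<float> & a, const std::vector<float> & b,
                               int ne0, int ne1, int ne2a, int ne2b, int ne3) {
    std::vector<float> out((size_t)ne0 * ne1 * (ne2a + ne2b) * ne3);
    auto idx = [&](int i0, int i1, int i2, int i3, int ne2) {
        return ((size_t)i3 * ne2 * ne1 + (size_t)i2 * ne1 + i1) * (size_t)ne0 + i0;
    };
    for (int i3 = 0; i3 < ne3; ++i3)
    for (int i2 = 0; i2 < ne2a + ne2b; ++i2)
    for (int i1 = 0; i1 < ne1; ++i1)
    for (int i0 = 0; i0 < ne0; ++i0) {
        // first ne2a slices come from a, the rest from b
        const float v = i2 < ne2a ? a[idx(i0, i1, i2, i3, ne2a)]
                                  : b[idx(i0, i1, i2 - ne2a, i3, ne2b)];
        out[idx(i0, i1, i2, i3, ne2a + ne2b)] = v;
    }
    return out;
}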
@@ -903,6 +951,17 @@ void ggml_metal_graph_compute(
                            GGML_ASSERT(false);
                    }
                } break;
+           case GGML_OP_SQR:
+               {
+                   GGML_ASSERT(ggml_is_contiguous(src0));
+
+                   [encoder setComputePipelineState:ctx->pipeline_sqr];
+                   [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                   [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
+
+                   const int64_t n = ggml_nelements(dst);
+                   [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+               } break;
            case GGML_OP_SOFT_MAX:
                {
                    const int nth = MIN(32, ne00);
@@ -944,21 +1003,46 @@ void ggml_metal_graph_compute(
                } break;
            case GGML_OP_MUL_MAT:
                {
-                   // TODO: needs to be updated after PR: https://github.com/ggerganov/ggml/pull/224

                    GGML_ASSERT(ne00 == ne10);
-                   // GGML_ASSERT(ne02 == ne12); // Should be checked on individual data types until broadcast is implemented everywhere
-                   uint gqa = ne12/ne02;
                    GGML_ASSERT(ne03 == ne13);
+
+                   const uint gqa = ne12/ne02;
+
+                   // find the break-even point where the matrix-matrix kernel becomes more efficient compared
+                   // to the matrix-vector kernel
+                   int ne11_mm_min = 1;
+
+#if 0
+                   // the numbers below are measured on M2 Ultra for 7B and 13B models
+                   // these numbers do not translate to other devices or model sizes
+                   // TODO: need to find a better approach
+                   if ([ctx->device.name isEqualToString:@"Apple M2 Ultra"]) {
+                       switch (src0t) {
+                           case GGML_TYPE_F16:  ne11_mm_min = 2;  break;
+                           case GGML_TYPE_Q8_0: ne11_mm_min = 7;  break;
+                           case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
+                           case GGML_TYPE_Q3_K: ne11_mm_min = 7;  break;
+                           case GGML_TYPE_Q4_0:
+                           case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
+                           case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
+                           case GGML_TYPE_Q5_0: // not tested yet
+                           case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
+                           case GGML_TYPE_Q5_K: ne11_mm_min = 7;  break;
+                           case GGML_TYPE_Q6_K: ne11_mm_min = 7;  break;
+                           default:             ne11_mm_min = 1;  break;
+                       }
+                   }
+#endif
+
                    // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
                    // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
-                   if (!ggml_is_transposed(src0) &&
+                   if ([ctx->device supportsFamily:MTLGPUFamilyApple7] &&
+                       !ggml_is_transposed(src0) &&
                        !ggml_is_transposed(src1) &&
                        src1t == GGML_TYPE_F32 &&
-                       [ctx->device supportsFamily:MTLGPUFamilyApple7] &&
                        ne00 % 32 == 0 &&
-                       ne11 > 2) {
+                       ne11 > ne11_mm_min) {
+                       //printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
                        switch (src0->type) {
                            case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f32_f32]; break;
                            case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f16_f32]; break;
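Note: the change above makes the matrix-matrix kernel kick in only past a per-quantization break-even batch size (ne11_mm_min) instead of the fixed ne11 > 2. A self-contained sketch of that selection rule; the thresholds mirror the disabled M2 Ultra table in the diff and are illustrative, not tuned values for other devices:

#include <cstdio>

enum class SrcType { F16, Q8_0, Q4_0, Other };

// per-type break-even point between the matrix-vector and matrix-matrix kernels
int ne11_mm_min(SrcType t) {
    switch (t) {
        case SrcType::F16:  return 2;
        case SrcType::Q8_0: return 7;
        case SrcType::Q4_0: return 15;
        default:            return 1;
    }
}

// mirrors the dispatch condition: simdgroup mm support, aligned ne00, large enough batch
bool use_mat_mat_kernel(SrcType t, int ne00, int ne11, bool has_simdgroup_mm) {
    return has_simdgroup_mm && ne00 % 32 == 0 && ne11 > ne11_mm_min(t);
}

int main() {
    std::printf("%d\n", use_mat_mat_kernel(SrcType::Q4_0, 4096, 1,  true)); // 0: matrix-vector path
    std::printf("%d\n", use_mat_mat_kernel(SrcType::Q4_0, 4096, 32, true)); // 1: matrix-matrix path
}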
@@ -992,12 +1076,13 @@ void ggml_metal_graph_compute(
                        int nth0 = 32;
                        int nth1 = 1;
                        int nrows = 1;
+                       //printf("vector: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);

                        // use custom matrix x vector kernel
                        switch (src0t) {
                            case GGML_TYPE_F32:
                                {
-                                   [encoder setComputePipelineState:ctx->pipeline_mul_mat_f32_f32];
+                                   [encoder setComputePipelineState:ctx->pipeline_mul_mv_f32_f32];
                                    nrows = 4;
                                } break;
                            case GGML_TYPE_F16:
@@ -1005,12 +1090,12 @@ void ggml_metal_graph_compute(
                                    nth0 = 32;
                                    nth1 = 1;
                                    if (ne11 * ne12 < 4) {
-                                       [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_1row];
+                                       [encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32_1row];
                                    } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
-                                       [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_l4];
+                                       [encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32_l4];
                                        nrows = ne11;
                                    } else {
-                                       [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
+                                       [encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32];
                                        nrows = 4;
                                    }
                                } break;
@@ -1021,7 +1106,7 @@ void ggml_metal_graph_compute(

                                    nth0 = 8;
                                    nth1 = 8;
-                                   [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_0_f32];
+                                   [encoder setComputePipelineState:ctx->pipeline_mul_mv_q4_0_f32];
                                } break;
                            case GGML_TYPE_Q4_1:
                                {
@@ -1030,7 +1115,7 @@ void ggml_metal_graph_compute(

                                    nth0 = 8;
                                    nth1 = 8;
-                                   [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_1_f32];
+                                   [encoder setComputePipelineState:ctx->pipeline_mul_mv_q4_1_f32];
                                } break;
                            case GGML_TYPE_Q8_0:
                                {
@@ -1039,7 +1124,7 @@ void ggml_metal_graph_compute(

                                    nth0 = 8;
                                    nth1 = 8;
-                                   [encoder setComputePipelineState:ctx->pipeline_mul_mat_q8_0_f32];
+                                   [encoder setComputePipelineState:ctx->pipeline_mul_mv_q8_0_f32];
                                } break;
                            case GGML_TYPE_Q2_K:
                                {
@@ -1048,7 +1133,7 @@ void ggml_metal_graph_compute(

                                    nth0 = 2;
                                    nth1 = 32;
-                                   [encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_K_f32];
+                                   [encoder setComputePipelineState:ctx->pipeline_mul_mv_q2_K_f32];
                                } break;
                            case GGML_TYPE_Q3_K:
                                {
@@ -1057,7 +1142,7 @@ void ggml_metal_graph_compute(

                                    nth0 = 2;
                                    nth1 = 32;
-                                   [encoder setComputePipelineState:ctx->pipeline_mul_mat_q3_K_f32];
+                                   [encoder setComputePipelineState:ctx->pipeline_mul_mv_q3_K_f32];
                                } break;
                            case GGML_TYPE_Q4_K:
                                {
@@ -1066,7 +1151,7 @@ void ggml_metal_graph_compute(

                                    nth0 = 4; //1;
                                    nth1 = 8; //32;
-                                   [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32];
+                                   [encoder setComputePipelineState:ctx->pipeline_mul_mv_q4_K_f32];
                                } break;
                            case GGML_TYPE_Q5_K:
                                {
@@ -1075,7 +1160,7 @@ void ggml_metal_graph_compute(

                                    nth0 = 2;
                                    nth1 = 32;
-                                   [encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_K_f32];
+                                   [encoder setComputePipelineState:ctx->pipeline_mul_mv_q5_K_f32];
                                } break;
                            case GGML_TYPE_Q6_K:
                                {
@@ -1084,7 +1169,7 @@ void ggml_metal_graph_compute(

                                    nth0 = 2;
                                    nth1 = 32;
-                                   [encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_K_f32];
+                                   [encoder setComputePipelineState:ctx->pipeline_mul_mv_q6_K_f32];
                                } break;
                            default:
                                {
@@ -1213,12 +1298,9 @@ void ggml_metal_graph_compute(
                    float max_bias;
                    memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));

-                   if (__builtin_popcount(n_head) != 1) {
-                       GGML_ASSERT(false && "only power-of-two n_head implemented");
-                   }

                    const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
                    const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
+                   const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);

                    [encoder setComputePipelineState:ctx->pipeline_alibi_f32];
                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||||
[encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16];
|
[encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16];
|
||||||
[encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17];
|
[encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17];
|
||||||
[encoder setBytes:&m0 length:sizeof( float) atIndex:18];
|
[encoder setBytes:&m0 length:sizeof( float) atIndex:18];
|
||||||
|
[encoder setBytes:&m1 length:sizeof( float) atIndex:19];
|
||||||
|
[encoder setBytes:&n_heads_log2_floor length:sizeof(int) atIndex:20];
|
||||||
|
|
||||||
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
||||||
} break;
|
} break;
|
||||||
|
@@ -1372,3 +1456,140 @@ void ggml_metal_graph_compute(
        }
    }
}

+////////////////////////////////////////////////////////////////////////////////
+
+// backend interface
+
+static const char * ggml_backend_metal_name(ggml_backend_t backend) {
+    return "Metal";
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_metal_free(ggml_backend_t backend) {
+    struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
+    ggml_metal_free(ctx);
+    free(backend);
+}
+
+static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
+    return (void *)buffer->context;
+}
+
+static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    free(buffer->context);
+    UNUSED(buffer);
+}
+
+static struct ggml_backend_buffer_i metal_backend_buffer_i = {
+    /* .free_buffer    = */ ggml_backend_metal_buffer_free_buffer,
+    /* .get_base       = */ ggml_backend_metal_buffer_get_base,
+    /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
+    /* .init_tensor    = */ NULL, // no initialization required
+    /* .free_tensor    = */ NULL, // no cleanup required
+};
+
+static ggml_backend_buffer_t ggml_backend_metal_alloc_buffer(ggml_backend_t backend, size_t size) {
+    struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
+
+    void * data = ggml_metal_host_malloc(size);
+
+    // TODO: set proper name of the buffers
+    ggml_metal_add_buffer(ctx, "backend", data, size, 0);
+
+    return ggml_backend_buffer_init(backend, metal_backend_buffer_i, data, size);
+}
+
+static size_t ggml_backend_metal_get_alignment(ggml_backend_t backend) {
+    return 32;
+    UNUSED(backend);
+}
+
+static void ggml_backend_metal_set_tensor_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+
+    memcpy((char *)tensor->data + offset, data, size);
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_metal_get_tensor_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+
+    memcpy(data, (const char *)tensor->data + offset, size);
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_metal_synchronize(ggml_backend_t backend) {
+    UNUSED(backend);
+}
+
+static void ggml_backend_metal_cpy_tensor_from(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
+    ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_metal_cpy_tensor_to(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
+    ggml_backend_tensor_set_async(dst, src->data, 0, ggml_nbytes(src));
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
+
+    ggml_metal_graph_compute(metal_ctx, cgraph);
+}
+
+static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
+    return true;
+    UNUSED(backend);
+    UNUSED(op);
+}
+
+static struct ggml_backend_i metal_backend_i = {
+    /* .get_name            = */ ggml_backend_metal_name,
+    /* .free                = */ ggml_backend_metal_free,
+    /* .alloc_buffer        = */ ggml_backend_metal_alloc_buffer,
+    /* .get_alignment       = */ ggml_backend_metal_get_alignment,
+    /* .set_tensor_async    = */ ggml_backend_metal_set_tensor_async,
+    /* .get_tensor_async    = */ ggml_backend_metal_get_tensor_async,
+    /* .synchronize         = */ ggml_backend_metal_synchronize,
+    /* .cpy_tensor_from     = */ ggml_backend_metal_cpy_tensor_from,
+    /* .cpy_tensor_to       = */ ggml_backend_metal_cpy_tensor_to,
+    /* .graph_plan_create   = */ NULL, // the metal implementation does not require creating graph plans atm
+    /* .graph_plan_free     = */ NULL,
+    /* .graph_plan_compute  = */ NULL,
+    /* .graph_compute       = */ ggml_backend_metal_graph_compute,
+    /* .supports_op         = */ ggml_backend_metal_supports_op,
+};
+
+ggml_backend_t ggml_backend_metal_init(void) {
+    struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
+
+    ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS);
+
+    ggml_backend_t metal_backend = malloc(sizeof(struct ggml_backend));
+
+    *metal_backend = (struct ggml_backend) {
+        /* .interface = */ metal_backend_i,
+        /* .context   = */ ctx,
+    };
+
+    return metal_backend;
+}
+
+bool ggml_backend_is_metal(ggml_backend_t backend) {
+    return backend->iface.get_name == ggml_backend_metal_name;
+}
+
+void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
+    struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
+
+    ggml_metal_set_n_cb(ctx, n_cb);
+}
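For context, a minimal, hypothetical sketch of how the new entry points above could be driven from host code. Only ggml_backend_metal_init, ggml_backend_is_metal and ggml_backend_metal_set_n_cb are defined by this patch; graph construction and teardown are assumed to go through the generic ggml-backend API and are not shown by the diff.

// Hypothetical usage sketch for the Metal backend added above (not part of the patch).
#include <stdio.h>

static void try_metal_backend(void) {
    ggml_backend_t backend = ggml_backend_metal_init();   // wraps ggml_metal_init(GGML_DEFAULT_N_THREADS)
    if (backend == NULL || !ggml_backend_is_metal(backend)) {
        fprintf(stderr, "Metal backend unavailable\n");
        return;
    }

    // split command buffer encoding across 4 callbacks (forwards to ggml_metal_set_n_cb)
    ggml_backend_metal_set_n_cb(backend, 4);

    // ... allocate buffers and run graphs through the generic ggml-backend interface ...
}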
132 ggml-metal.metal
@@ -132,6 +132,13 @@ kernel void kernel_relu(
    dst[tpig] = max(0.0f, src0[tpig]);
}

+kernel void kernel_sqr(
+        device const float * src0,
+        device       float * dst,
+        uint tpig[[thread_position_in_grid]]) {
+    dst[tpig] = src0[tpig] * src0[tpig];
+}
+
constant float GELU_COEF_A    = 0.044715f;
constant float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
@@ -428,18 +435,23 @@ void mul_vec_q_n_f32(device const void * src0, device const float * src1, device
        int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne10, int64_t ne12, int64_t ne0, int64_t ne1, uint gqa,
        uint3 tgpig, uint tiisg, uint sgitg) {
    const int nb = ne00/QK4_0;

    const int r0 = tgpig.x;
    const int r1 = tgpig.y;
    const int im = tgpig.z;

    const int first_row = (r0 * nsg + sgitg) * nr;

    const uint offset0 = first_row * nb + im/gqa*(nb*ne0);

    device const block_q_type * x = (device const block_q_type *) src0 + offset0;
    device const float        * y = (device const float        *) src1 + r1*ne10 + im*ne00*ne1;

    float yl[16]; // src1 vector cache
    float sumf[nr] = {0.f};

-   const int ix = tiisg/2;
-   const int il = 8*(tiisg%2);
+   const int ix = (tiisg/2);
+   const int il = (tiisg%2)*8;

    device const float * yb = y + ix * QK4_0 + il;

@@ -450,6 +462,7 @@ void mul_vec_q_n_f32(device const void * src0, device const float * src1, device
            sumy += yb[i] + yb[i+1];
            yl[i+0] = yb[i+ 0];
            yl[i+1] = yb[i+ 1]/256.f;

            sumy += yb[i+16] + yb[i+17];
            yl[i+8] = yb[i+16]/16.f;
            yl[i+9] = yb[i+17]/4096.f;

@@ -465,12 +478,12 @@ void mul_vec_q_n_f32(device const void * src0, device const float * src1, device
    for (int row = 0; row < nr; ++row) {
        const float tot = simd_sum(sumf[row]);
        if (tiisg == 0 && first_row + row < ne01) {
-           dst[r1*ne0 + im*ne0*ne1 + first_row + row] = tot;
+           dst[im*ne0*ne1 + r1*ne0 + first_row + row] = tot;
        }
    }
}
-kernel void kernel_mul_mat_q4_0_f32(
+kernel void kernel_mul_mv_q4_0_f32(
        device const void * src0,
        device const float * src1,
        device       float * dst,
@@ -488,7 +501,7 @@ kernel void kernel_mul_mat_q4_0_f32(
    mul_vec_q_n_f32<block_q4_0, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg);
}

-kernel void kernel_mul_mat_q4_1_f32(
+kernel void kernel_mul_mv_q4_1_f32(
        device const void * src0,
        device const float * src1,
        device       float * dst,
@@ -508,7 +521,7 @@ kernel void kernel_mul_mat_q4_1_f32(

#define NB_Q8_0 8

-kernel void kernel_mul_mat_q8_0_f32(
+kernel void kernel_mul_mv_q8_0_f32(
        device const void * src0,
        device const float * src1,
        device       float * dst,
@@ -572,7 +585,7 @@ kernel void kernel_mul_mat_q8_0_f32(

#define N_F32_F32 4

-kernel void kernel_mul_mat_f32_f32(
+kernel void kernel_mul_mv_f32_f32(
        device const char * src0,
        device const char * src1,
        device      float * dst,
@@ -643,7 +656,7 @@ kernel void kernel_mul_mat_f32_f32(
        }
    }
}

-kernel void kernel_mul_mat_f16_f32_1row(
+kernel void kernel_mul_mv_f16_f32_1row(
        device const char * src0,
        device const char * src1,
        device      float * dst,
@@ -697,7 +710,7 @@ kernel void kernel_mul_mat_f16_f32_1row(

#define N_F16_F32 4

-kernel void kernel_mul_mat_f16_f32(
+kernel void kernel_mul_mv_f16_f32(
        device const char * src0,
        device const char * src1,
        device      float * dst,
@@ -769,7 +782,7 @@ kernel void kernel_mul_mat_f16_f32(
}

// Assumes row size (ne00) is a multiple of 4
-kernel void kernel_mul_mat_f16_f32_l4(
+kernel void kernel_mul_mv_f16_f32_l4(
        device const char * src0,
        device const char * src1,
        device      float * dst,
@@ -831,6 +844,8 @@ kernel void kernel_alibi_f32(
        constant  uint64_t & nb2,
        constant  uint64_t & nb3,
        constant     float & m0,
+       constant     float & m1,
+       constant       int & n_heads_log2_floor,
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint3 tpitg[[thread_position_in_threadgroup]],
        uint3   ntg[[threads_per_threadgroup]]) {
@@ -846,7 +861,12 @@ kernel void kernel_alibi_f32(
    const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);

    device float * dst_data = (device float *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
-   float m_k = pow(m0, i2 + 1);
+   float m_k;
+   if (i2 < n_heads_log2_floor) {
+       m_k = pow(m0, i2 + 1);
+   } else {
+       m_k = pow(m1, 2 * (i2 - n_heads_log2_floor) + 1);
+   }
    for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) {
        device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
        dst_data[i00] = src[0] + m_k * (i00 - ne00 + 1);
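For reference, the per-head slope selection enabled by the new m1 and n_heads_log2_floor arguments can be restated on the host as below. This mirrors the kernel logic above and the standard ALiBi recipe for head counts that are not a power of two; it is an illustration only, not part of the patch.

#include <math.h>

// Mirror of the m_k computation in kernel_alibi_f32 (illustration only).
static float alibi_slope(int i2, int n_head, float max_bias) {
    const int   n_heads_log2_floor = 1 << (int) floor(log2(n_head));
    const float m0 = powf(2.0f, -(max_bias)        / n_heads_log2_floor);
    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);

    if (i2 < n_heads_log2_floor) {
        return powf(m0, i2 + 1);                        // first 2^k heads: m0, m0^2, ...
    }
    return powf(m1, 2 * (i2 - n_heads_log2_floor) + 1); // remaining heads: m1, m1^3, ...
}
// e.g. n_head = 12, max_bias = 8: heads 0..7 get 0.5, 0.25, ..., 1/256;
// heads 8..11 get ~0.707, ~0.354, ~0.177, ~0.088.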
@@ -1091,6 +1111,62 @@ kernel void kernel_cpy_f32_f32(
    }
}

+kernel void kernel_concat(
+    device  const char * src0,
+    device  const char * src1,
+    device        char * dst,
+    constant   int64_t & ne00,
+    constant   int64_t & ne01,
+    constant   int64_t & ne02,
+    constant   int64_t & ne03,
+    constant  uint64_t & nb00,
+    constant  uint64_t & nb01,
+    constant  uint64_t & nb02,
+    constant  uint64_t & nb03,
+    constant   int64_t & ne10,
+    constant   int64_t & ne11,
+    constant   int64_t & ne12,
+    constant   int64_t & ne13,
+    constant  uint64_t & nb10,
+    constant  uint64_t & nb11,
+    constant  uint64_t & nb12,
+    constant  uint64_t & nb13,
+    constant   int64_t & ne0,
+    constant   int64_t & ne1,
+    constant   int64_t & ne2,
+    constant   int64_t & ne3,
+    constant  uint64_t & nb0,
+    constant  uint64_t & nb1,
+    constant  uint64_t & nb2,
+    constant  uint64_t & nb3,
+    uint3 tgpig[[threadgroup_position_in_grid]],
+    uint3 tpitg[[thread_position_in_threadgroup]],
+    uint3   ntg[[threads_per_threadgroup]]) {
+
+    const int64_t i03 = tgpig.z;
+    const int64_t i02 = tgpig.y;
+    const int64_t i01 = tgpig.x;
+
+    const int64_t i13 = i03 % ne13;
+    const int64_t i12 = i02 % ne12;
+    const int64_t i11 = i01 % ne11;
+
+    device const char * src0_ptr = src0 + i03 * nb03 + i02 * nb02 + i01 * nb01 + tpitg.x*nb00;
+    device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11 + tpitg.x*nb10;
+    device       char * dst_ptr  = dst  + i03*nb3  + i02*nb2  + i01*nb1  + tpitg.x*nb0;
+
+    for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
+        if (i02 < ne02) {
+            ((device float *)dst_ptr)[0] = ((device float *)src0_ptr)[0];
+            src0_ptr += ntg.x*nb00;
+        } else {
+            ((device float *)dst_ptr)[0] = ((device float *)src1_ptr)[0];
+            src1_ptr += ntg.x*nb10;
+        }
+        dst_ptr += ntg.x*nb0;
+    }
+}
+
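The new kernel_concat walks dst one slice at a time: while the second-dimension index still fits inside src0 it copies from src0, otherwise from src1, with src1 indices wrapped by its own extents (the i03 % ne13 / i02 % ne12 / i01 % ne11 lines). A host-side restatement of that selection, for illustration only:

#include <stdint.h>

// Which source a given dst slice i02 is read from, as kernel_concat computes it.
static void concat_pick(int64_t i02, int64_t ne02, int64_t ne12,
                        int * use_src1, int64_t * i12) {
    *use_src1 = (i02 < ne02) ? 0 : 1;   // src0 while the index fits, src1 afterwards
    *i12      = i02 % ne12;             // src1 slice index, wrapped by src1's extent
}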
//============================================ k-quants ======================================================

#ifndef QK_K
@@ -1183,7 +1259,7 @@ static inline uchar4 get_scale_min_k4(int j, device const uint8_t * q) {

//====================================== dot products =========================

-kernel void kernel_mul_mat_q2_K_f32(
+kernel void kernel_mul_mv_q2_K_f32(
        device const void * src0,
        device const float * src1,
        device       float * dst,
@@ -1327,7 +1403,7 @@ kernel void kernel_mul_mat_q2_K_f32(
}

#if QK_K == 256
-kernel void kernel_mul_mat_q3_K_f32(
+kernel void kernel_mul_mv_q3_K_f32(
        device const void * src0,
        device const float * src1,
        device       float * dst,
@@ -1479,7 +1555,7 @@ kernel void kernel_mul_mat_q3_K_f32(
    }
}
#else
-kernel void kernel_mul_mat_q3_K_f32(
+kernel void kernel_mul_mv_q3_K_f32(
        device const void * src0,
        device const float * src1,
        device       float * dst,
@@ -1550,7 +1626,7 @@ kernel void kernel_mul_mat_q3_K_f32(
#endif

#if QK_K == 256
-kernel void kernel_mul_mat_q4_K_f32(
+kernel void kernel_mul_mv_q4_K_f32(
        device const void * src0,
        device const float * src1,
        device       float * dst,
@@ -1656,7 +1732,7 @@ kernel void kernel_mul_mat_q4_K_f32(
    }
}
#else
-kernel void kernel_mul_mat_q4_K_f32(
+kernel void kernel_mul_mv_q4_K_f32(
        device const void * src0,
        device const float * src1,
        device       float * dst,
@@ -1745,7 +1821,7 @@ kernel void kernel_mul_mat_q4_K_f32(
}
#endif

-kernel void kernel_mul_mat_q5_K_f32(
+kernel void kernel_mul_mv_q5_K_f32(
        device const void * src0,
        device const float * src1,
        device       float * dst,
@@ -1918,7 +1994,7 @@ kernel void kernel_mul_mat_q5_K_f32(

}

-kernel void kernel_mul_mat_q6_K_f32(
+kernel void kernel_mul_mv_q6_K_f32(
        device const void * src0,
        device const float * src1,
        device       float * dst,
@@ -2256,7 +2332,7 @@ kernel void kernel_get_rows(
}

#define BLOCK_SIZE_M 64 // 8 simdgroup matrices from matrix A
-#define BLOCK_SIZE_N 32 // 4 simdgroup matrices from matrix A
+#define BLOCK_SIZE_N 32 // 4 simdgroup matrices from matrix B
#define BLOCK_SIZE_K 32
#define THREAD_MAT_M 4 // each thread take 4 simdgroup matrices from matrix A
#define THREAD_MAT_N 2 // each thread take 2 simdgroup matrices from matrix B
@@ -2293,9 +2369,11 @@ kernel void kernel_mul_mm(device const uchar * src0,
    const uint r0 = tgpig.y;
    const uint r1 = tgpig.x;
    const uint im = tgpig.z;
+
    // if this block is of 64x32 shape or smaller
    short n_rows = (ne0 - r0 * BLOCK_SIZE_M < BLOCK_SIZE_M) ? (ne0 - r0 * BLOCK_SIZE_M) : BLOCK_SIZE_M;
    short n_cols = (ne1 - r1 * BLOCK_SIZE_N < BLOCK_SIZE_N) ? (ne1 - r1 * BLOCK_SIZE_N) : BLOCK_SIZE_N;

    // a thread shouldn't load data outside of the matrix
    short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1;
    short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1;
@@ -2323,22 +2401,26 @@ kernel void kernel_mul_mm(device const uchar * src0,
        half4x4 temp_a;
        dequantize_func(x, il, temp_a);
        threadgroup_barrier(mem_flags::mem_threadgroup);
+
        #pragma unroll(16)
        for (int i = 0; i < 16; i++) {
            *(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \
-           + 16 * (tiitg % THREAD_PER_ROW) + 8 * (i / 8)) \
+           + (tiitg % THREAD_PER_ROW) * 16 + (i / 8) * 8) \
            + (tiitg / THREAD_PER_ROW) % 8 + (i & 7) * 8) = temp_a[i/4][i%4];
        }
-       *(threadgroup float2x4 *)(sb + (tiitg % THREAD_PER_COL) * 8 * 32 + 8 * (tiitg / THREAD_PER_COL)) \
-       = *((device float2x4 *)y);
+
+       *(threadgroup float2x4 *)(sb + (tiitg % THREAD_PER_COL) * 8 * 32 + 8 * (tiitg / THREAD_PER_COL)) = *((device float2x4 *)y);
+
        il = (il + 2 < nl) ? il + 2 : il % 2;
        x  = (il < 2) ? x + (2+nl-1)/nl : x;
        y += BLOCK_SIZE_K;

        threadgroup_barrier(mem_flags::mem_threadgroup);
+
        // load matrices from threadgroup memory and conduct outer products
        threadgroup half  * lsma = (sa + THREAD_MAT_M * SG_MAT_SIZE * (sgitg % 2));
        threadgroup float * lsmb = (sb + THREAD_MAT_N * SG_MAT_SIZE * (sgitg / 2));
+
        #pragma unroll(4)
        for (int ik = 0; ik < BLOCK_SIZE_K / 8; ik++) {
        #pragma unroll(4)
@@ -2353,6 +2435,7 @@ kernel void kernel_mul_mm(device const uchar * src0,

        lsma += BLOCK_SIZE_M / SG_MAT_ROW * SG_MAT_SIZE;
        lsmb += BLOCK_SIZE_N / SG_MAT_ROW * SG_MAT_SIZE;
+
        #pragma unroll(8)
        for (int i = 0; i < 8; i++){
            simdgroup_multiply_accumulate(c_res[i], mb[i/4], ma[i%4], c_res[i]);
@@ -2361,7 +2444,7 @@ kernel void kernel_mul_mm(device const uchar * src0,
    }

    if ((r0 + 1) * BLOCK_SIZE_M <= ne0 && (r1 + 1) * BLOCK_SIZE_N <= ne1) {
-       device float *C = dst + BLOCK_SIZE_M * r0 + 32 * (sgitg&1) \
+       device float * C = dst + (BLOCK_SIZE_M * r0 + 32 * (sgitg & 1)) \
            + (BLOCK_SIZE_N * r1 + 16 * (sgitg >> 1)) * ne0 + im*ne1*ne0;
        for (int i = 0; i < 8; i++) {
            simdgroup_store(c_res[i], C + 8 * (i%4) + 8 * ne0 * (i/4), ne0);
@@ -2376,7 +2459,8 @@ kernel void kernel_mul_mm(device const uchar * src0,
    }

    threadgroup_barrier(mem_flags::mem_threadgroup);
-   device float *C = dst + BLOCK_SIZE_M * r0 + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
+
+   device float * C = dst + (BLOCK_SIZE_M * r0) + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
    if (sgitg == 0) {
        for (int i = 0; i < n_rows; i++) {
            for (int j = tiitg; j < n_cols; j += BLOCK_SIZE_N) {
168 ggml-opencl.cpp
@@ -202,14 +202,14 @@ inline void get_scale_min_k4(int j, const __global uint8_t *q, uint8_t *d, uint8

__kernel void dequantize_block_q2_K(__global const struct block_q2_K *x, __global float *yy)
{
-   const int i = get_group_id(0);
+   const int i = get_group_id(0) + get_global_offset(0);
    const int tid = get_local_id(0);
    const int n = tid / 32;
    const int l = tid - 32 * n;
    const int is = 8 * n + l / 16;

    const uint8_t q = x[i].qs[32 * n + l];
-   __global float *y = yy + i * QK_K + 128 * n;
+   __global float *y = yy + get_group_id(0) * QK_K + 128 * n;

    const float dall = vload_half(0, &x[i].d);
    const float dmin = vload_half(0, &x[i].dmin);
@@ -223,7 +223,7 @@ __kernel void dequantize_block_q2_K(__global const struct block_q2_K *x, __globa
__kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __global float *yy)
{
    int r = get_local_id(0) / 4;
-   int i = get_group_id(0);
+   int i = get_group_id(0) + get_global_offset(0);
    int tid = r / 2;
    int is0 = r % 2;
    int l0 = 16 * is0 + 4 * (get_local_id(0) % 4);
@@ -241,7 +241,7 @@ __kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __globa
    float d_all = vload_half(0, &x[i].d);
    float dl = d_all * (us - 32);

-   __global float *y = yy + i * QK_K + 128 * n + 32 * j;
+   __global float *y = yy + get_group_id(0) * QK_K + 128 * n + 32 * j;
    const __global uint8_t *q = x[i].qs + 32 * n;
    const __global uint8_t *hm = x[i].hmask;

@@ -251,14 +251,14 @@ __kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __globa

__kernel void dequantize_block_q4_K(__global const struct block_q4_K *x, __global float *yy)
{
-   const int i = get_group_id(0);
+   const int i = get_group_id(0) + get_global_offset(0);
    const int tid = get_local_id(0);
    const int il = tid / 8;
    const int ir = tid % 8;
    const int is = 2 * il;
    const int n = 4;

-   __global float *y = yy + i * QK_K + 64 * il + n * ir;
+   __global float *y = yy + get_group_id(0) * QK_K + 64 * il + n * ir;

    const float dall = vload_half(0, &x[i].d);
    const float dmin = vload_half(0, &x[i].dmin);
@@ -281,13 +281,13 @@ __kernel void dequantize_block_q4_K(__global const struct block_q4_K *x, __globa

__kernel void dequantize_block_q5_K(__global const struct block_q5_K *x, __global float *yy)
{
-   const int i = get_group_id(0);
+   const int i = get_group_id(0) + get_global_offset(0);
    const int tid = get_local_id(0);
    const int il = tid / 16;
    const int ir = tid % 16;
    const int is = 2 * il;

-   __global float *y = yy + i * QK_K + 64 * il + 2 * ir;
+   __global float *y = yy + get_group_id(0) * QK_K + 64 * il + 2 * ir;

    const float dall = vload_half(0, &x[i].d);
    const float dmin = vload_half(0, &x[i].dmin);
@@ -313,13 +313,13 @@ __kernel void dequantize_block_q5_K(__global const struct block_q5_K *x, __globa

__kernel void dequantize_block_q6_K(__global const struct block_q6_K *x, __global float *yy)
{
-   const int i = get_group_id(0);
+   const int i = get_group_id(0) + get_global_offset(0);
    const int tid = get_local_id(0);
    const int ip = tid / 32;
    const int il = tid - 32 * ip;
    const int is = 8 * ip + il / 16;

-   __global float *y = yy + i * QK_K + 128 * ip + il;
+   __global float *y = yy + get_group_id(0) * QK_K + 128 * ip + il;

    const float d = vload_half(0, &x[i].d);

@@ -730,7 +730,7 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) {
    const uint qk = QUANT_K;
    const uint qr = QUANT_R;

-   const int ib = i/qk; // block index
+   const int ib = i/qk + get_global_offset(0); // block index
    const int iqs = (i%qk)/qr; // quant index
    const int iybs = i - i%qk; // y block start index
    const int y_offset = qr == 1 ? 1 : qk/2;
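With get_global_offset(0) folded into the block index (while the output pointer keeps using get_group_id(0)), the host can dequantize a single 2D slice out of a larger GPU-resident tensor simply by enqueuing with a non-zero global work offset. A hedged sketch of that call, matching the ggml_cl_mul_mat_q_f32 change further down; names here are illustrative:

#include <CL/cl.h>

// Dequantize one 2D slice of a quantized tensor that already lives on the GPU.
static void enqueue_dequant_slice(cl_command_queue queue, cl_kernel to_fp32_cl,
                                  cl_mem d_Q, cl_mem d_X,
                                  size_t slice_index, size_t blocks_per_slice, size_t global) {
    const size_t offset = slice_index * blocks_per_slice;   // first block of the slice

    clSetKernelArg(to_fp32_cl, 0, sizeof(cl_mem), &d_Q);
    clSetKernelArg(to_fp32_cl, 1, sizeof(cl_mem), &d_X);
    clEnqueueNDRangeKernel(queue, to_fp32_cl, 1,
                           offset > 0 ? &offset : NULL,     // global work offset
                           &global, NULL, 0, NULL, NULL);
}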
@@ -1349,31 +1349,43 @@ static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t offset, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cl_event* ev) {
    const enum ggml_type type = src->type;
    const size_t ts = ggml_type_size(type);
    const size_t bs = ggml_blck_size(type);
+   const uint64_t row_size = ts*ne0/bs;

-   const void * x = (const void *) ((const char *) src->data + i2*nb2 + i3*nb3);
-   if (nb0 == ts && nb1 == ts*ne0/bs) {
-       err = clEnqueueWriteBuffer(queue, dst, CL_FALSE, offset, ne1*nb1, x, 0, NULL, ev);
-       return err;
+   const char * x = (const char *) src->data + i2*nb2 + i3*nb3;
+   if (nb0 == ts && nb1 == row_size) {
+       return clEnqueueWriteBuffer(queue, dst, CL_FALSE, offset, ne1*row_size, x, 0, NULL, ev);
    }
    if (nb0 == ts) {
        const size_t buffer_origin[3] = { offset, 0, 0 };
        const size_t host_origin[3] = { 0, 0, 0 };
-       const size_t region[3] = { ts*ne0/bs, ne1, 1 };
-       err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, ts*ne0/bs, 0, nb1, 0, x, 0, NULL, ev);
-       return err;
+       const size_t region[3] = { row_size, ne1, 1 };
+       return clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, row_size, 0, nb1, 0, x, 0, NULL, ev);
    }
+   std::vector<cl_event> events;
+   if (ev && ne1>1) events.reserve(ne1-1);
    for (uint64_t i1 = 0; i1 < ne1; i1++) {
        // pretend the row is a matrix with cols=1
-       const size_t buffer_origin[3] = { offset, i1, 0 };
+       const size_t buffer_origin[3] = { offset + i1*row_size, 0, 0 };
        const size_t host_origin[3] = { 0, 0, 0 };
-       const size_t region[3] = { ts/bs, ne0, 1 };
-       err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, 0, 0, nb0, 0, ((const char *)x) + i1*nb0, 0, NULL, ev);
-       if (err != CL_SUCCESS) {
-           break;
+       const size_t region[3] = { ts, ne0/bs, 1 };
+       // if an event is requested, make the last write wait for all previous writes to complete
+       if (ev && i1) {
+           events.push_back(*ev);
        }
+       cl_uint nevents = i1 == ne1-1 ? events.size() : 0U;
+       err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, ts, 0, nb0, 0, x + i1*nb1, nevents, nevents ? events.data() : nullptr, ev);
+       if (err != CL_SUCCESS) {
+           for (auto event : events) {
+               clReleaseEvent(event);
+           }
            return err;
        }
    }
+   for (auto event : events) {
+       CL_CHECK(clReleaseEvent(event));
+   }
+   return CL_SUCCESS;
}
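Because the final per-row write now waits on all earlier ones, the single event returned through ev is not signalled until the whole slice upload has finished, so a caller only needs to block on that one event. Sketch only (ggml_cl_h2d_tensor_2d is internal to this file, and the buffer and tensor names are placeholders):

static void upload_and_wait(cl_command_queue queue, cl_mem d_X,
                            const struct ggml_tensor * src, uint64_t i3, uint64_t i2) {
    cl_event ev;
    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src, i3, i2, &ev));
    CL_CHECK(clWaitForEvents(1, &ev));   // returns once every row of the slice has been written
    CL_CHECK(clReleaseEvent(ev));
}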
static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
@@ -1476,10 +1488,15 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr

    const int64_t ne10 = src1->ne[0];
    const int64_t ne11 = src1->ne[1];
+   const int64_t ne12 = src1->ne[2];
+   const int64_t ne13 = src1->ne[3];

    const int nb2  = dst->nb[2];
    const int nb3  = dst->nb[3];

+   const int64_t r2 = ne12 / ne02;
+   const int64_t r3 = ne13 / ne03;
+
    const float alpha = 1.0f;
    const float beta  = 0.0f;
    const int x_ne = ne01 * ne00;
@@ -1498,13 +1515,25 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
    cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);

-   for (int64_t i03 = 0; i03 < ne03; i03++) {
-       for (int64_t i02 = 0; i02 < ne02; i02++) {
+   size_t x_offset = 0;
+   int64_t pi02 = -1;
+   int64_t pi03 = -1;
+
+   for (int64_t i13 = 0; i13 < ne13; i13++) {
+       int64_t i03 = i13 / r3;
+
+       for (int64_t i12 = 0; i12 < ne12; i12++) {
+           int64_t i02 = i12 / r2;
+
            // copy data to device
-           if (src0->backend != GGML_BACKEND_GPU) {
+           if (src0->backend == GGML_BACKEND_GPU) {
+               x_offset = (i03 * ne02 + i02) * x_ne;
+           } else if (i02 != pi02 || i03 != pi03) {
                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
+               pi02 = i02;
+               pi03 = i03;
            }
-           CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
+           CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));

            CL_CHECK(clFinish(queue));

@@ -1514,7 +1543,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
            clblast::Transpose::kYes, clblast::Transpose::kNo,
            ne01, ne11, ne10,
            alpha,
-           d_X, 0, ne00,
+           d_X, x_offset, ne00,
            d_Y, 0, ne10,
            beta,
            d_D, 0, ne01,
@@ -1525,7 +1554,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
            }

            // copy dst to host
-           float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+           float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
            CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
        }
    }
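The new i13/i12 loops broadcast src0 across the extra slices of src1: r2 and r3 are the ratios of the batch dimensions, and each src1 slice maps back to a src0 slice by integer division. A small worked example of that mapping (numbers made up, e.g. a grouped-query style layout with 8 src0 slices and 32 src1 slices); the same pattern is repeated in the f16 and quantized paths below:

#include <stdint.h>

static void broadcast_mapping_example(void) {
    const int64_t ne02 = 8, ne12 = 32;   // src0 has 8 slices, src1 has 32
    const int64_t r2   = ne12 / ne02;    // = 4

    for (int64_t i12 = 0; i12 < ne12; i12++) {
        const int64_t i02 = i12 / r2;    // slices 0..3 -> 0, 4..7 -> 1, ..., 28..31 -> 7
        (void) i02;
    }
}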
@@ -1547,6 +1576,8 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr

    const int64_t ne10 = src1->ne[0];
    const int64_t ne11 = src1->ne[1];
+   const int64_t ne12 = src1->ne[2];
+   const int64_t ne13 = src1->ne[3];

    const int nb10 = src1->nb[0];
    const int nb11 = src1->nb[1];
@@ -1556,6 +1587,9 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
    const int nb2  = dst->nb[2];
    const int nb3  = dst->nb[3];

+   const int64_t r2 = ne12 / ne02;
+   const int64_t r3 = ne13 / ne03;
+
    const ggml_fp16_t alpha = ggml_fp32_to_fp16(1.0f);
    const ggml_fp16_t beta = ggml_fp32_to_fp16(0.0f);
    const int x_ne = ne01 * ne00;
@@ -1577,32 +1611,44 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
    bool src1_cont_rows = nb10 == sizeof(float);
    bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);

-   for (int64_t i03 = 0; i03 < ne03; i03++) {
-       for (int64_t i02 = 0; i02 < ne02; i02++) {
+   size_t x_offset = 0;
+   int64_t pi02 = -1;
+   int64_t pi03 = -1;
+
+   for (int64_t i13 = 0; i13 < ne13; i13++) {
+       int64_t i03 = i13 / r3;
+
+       for (int64_t i12 = 0; i12 < ne12; i12++) {
+           int64_t i02 = i12 / r2;
+
            // copy src0 to device
-           if (src0->backend != GGML_BACKEND_GPU) {
+           if (src0->backend == GGML_BACKEND_GPU) {
+               x_offset = (i03 * ne02 + i02) * x_ne;
+           } else if (i02 != pi02 || i03 != pi03) {
                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
+               pi02 = i02;
+               pi03 = i03;
            }

            // convert src1 to fp16
            // TODO: use multiple threads
-           ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i03 * ne02 + i02);
-           char * src1i = (char *) src1->data + i03*nb13 + i02*nb12;
+           ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i13 * ne12 + i12);
+           char * src1i = (char *) src1->data + i13*nb13 + i12*nb12;
            if (src1_cont_rows) {
                if (src1_cont_cols) {
                    ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
                }
                else {
-                   for (int64_t i01 = 0; i01 < ne11; i01++) {
-                       ggml_fp32_to_fp16_row((float *) (src1i + i01*nb11), tmp + i01*ne10, ne10);
+                   for (int64_t i11 = 0; i11 < ne11; i11++) {
+                       ggml_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10);
                    }
                }
            }
            else {
-               for (int64_t i01 = 0; i01 < ne11; i01++) {
-                   for (int64_t i00 = 0; i00 < ne10; i00++) {
+               for (int64_t i11 = 0; i11 < ne11; i11++) {
+                   for (int64_t i10 = 0; i10 < ne10; i10++) {
                        // very slow due to no inlining
-                       tmp[i01*ne10 + i00] = ggml_fp32_to_fp16(*(float *) (src1i + i01*nb11 + i00*nb10));
+                       tmp[i11*ne10 + i10] = ggml_fp32_to_fp16(*(float *) (src1i + i11*nb11 + i10*nb10));
                    }
                }
            }
@@ -1618,7 +1664,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
            clblast::Transpose::kYes, clblast::Transpose::kNo,
            ne01, ne11, ne10,
            alpha,
-           d_X, 0, ne00,
+           d_X, x_offset, ne00,
            d_Y, 0, ne10,
            beta,
            d_D, 0, ne01,
@@ -1631,7 +1677,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
            // copy dst to host, then convert to float
            CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));

-           float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+           float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);

            ggml_fp16_to_fp32_row(tmp, d, d_ne);
        }
|
||||||
|
|
||||||
const int64_t ne10 = src1->ne[0];
|
const int64_t ne10 = src1->ne[0];
|
||||||
const int64_t ne11 = src1->ne[1];
|
const int64_t ne11 = src1->ne[1];
|
||||||
|
const int64_t ne12 = src1->ne[2];
|
||||||
|
const int64_t ne13 = src1->ne[3];
|
||||||
|
|
||||||
const int nb2 = dst->nb[2];
|
const int nb2 = dst->nb[2];
|
||||||
const int nb3 = dst->nb[3];
|
const int nb3 = dst->nb[3];
|
||||||
const ggml_type type = src0->type;
|
const ggml_type type = src0->type;
|
||||||
const bool mul_mat_vec = ne11 == 1;
|
const bool mul_mat_vec = ne11 == 1;
|
||||||
|
|
||||||
|
const int64_t r2 = ne12 / ne02;
|
||||||
|
const int64_t r3 = ne13 / ne03;
|
||||||
|
|
||||||
const float alpha = 1.0f;
|
const float alpha = 1.0f;
|
||||||
const float beta = 0.0f;
|
const float beta = 0.0f;
|
||||||
const int x_ne = ne01 * ne00;
|
const int x_ne = ne01 * ne00;
|
||||||
const int y_ne = ne11 * ne10;
|
const int y_ne = ne11 * ne10;
|
||||||
const int d_ne = ne11 * ne01;
|
const int d_ne = ne11 * ne01;
|
||||||
const size_t q_sz = ggml_type_size(type) * x_ne / ggml_blck_size(type);
|
const int x_bps = x_ne / ggml_blck_size(type); // blocks per 2D slice
|
||||||
|
const size_t q_sz = ggml_type_size(type) * x_bps;
|
||||||
|
|
||||||
size_t x_size;
|
size_t x_size;
|
||||||
size_t y_size;
|
size_t y_size;
|
||||||
|
@ -1690,12 +1742,23 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
|
||||||
size_t ev_idx = 0;
|
size_t ev_idx = 0;
|
||||||
std::vector<cl_event> events;
|
std::vector<cl_event> events;
|
||||||
|
|
||||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
int64_t pi02 = -1;
|
||||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
int64_t pi03 = -1;
|
||||||
|
|
||||||
|
for (int64_t i13 = 0; i13 < ne13; i13++) {
|
||||||
|
int64_t i03 = i13 / r3;
|
||||||
|
|
||||||
|
for (int64_t i12 = 0; i12 < ne12; i12++) {
|
||||||
|
int64_t i02 = i12 / r2;
|
||||||
|
|
||||||
// copy src0 to device if necessary
|
// copy src0 to device if necessary
|
||||||
if (src0->backend == GGML_BACKEND_CPU) {
|
if (src0->backend == GGML_BACKEND_CPU) {
|
||||||
|
if (i02 != pi02 || i03 != pi03) {
|
||||||
events.emplace_back();
|
events.emplace_back();
|
||||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
|
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
|
||||||
|
pi02 = i02;
|
||||||
|
pi03 = i03;
|
||||||
|
}
|
||||||
} else if (src0->backend == GGML_BACKEND_GPU) {
|
} else if (src0->backend == GGML_BACKEND_GPU) {
|
||||||
d_Q = (cl_mem) src0->extra;
|
d_Q = (cl_mem) src0->extra;
|
||||||
} else {
|
} else {
|
||||||
|
@ -1704,7 +1767,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
|
||||||
if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
|
if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
|
||||||
// copy src1 to device
|
// copy src1 to device
|
||||||
events.emplace_back();
|
events.emplace_back();
|
||||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, events.data() + ev_idx++));
|
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));
|
||||||
|
|
||||||
// compute
|
// compute
|
||||||
const size_t global = ne01 * CL_DMMV_BLOCK_SIZE;
|
const size_t global = ne01 * CL_DMMV_BLOCK_SIZE;
|
||||||
|
@ -1720,12 +1783,13 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
|
||||||
} else { // general dequantization kernel + CLBlast matrix matrix multiplication
|
} else { // general dequantization kernel + CLBlast matrix matrix multiplication
|
||||||
// convert src0 to fp32 on device
|
// convert src0 to fp32 on device
|
||||||
const size_t global = x_ne / global_denom;
|
const size_t global = x_ne / global_denom;
|
||||||
|
const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
|
||||||
CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
|
CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
|
||||||
CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
|
CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
|
||||||
CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
|
CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, offset > 0 ? &offset : NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
|
||||||
|
|
||||||
// copy src1 to device
|
// copy src1 to device
|
||||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
|
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
|
||||||
|
|
||||||
events.emplace_back();
|
events.emplace_back();
|
||||||
|
|
||||||
|
@ -1749,7 +1813,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
|
||||||
}
|
}
|
||||||
|
|
||||||
// copy dst to host
|
// copy dst to host
|
||||||
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
|
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
|
||||||
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
|
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
|
||||||
for (auto *event : events) {
|
for (auto *event : events) {
|
||||||
clReleaseEvent(event);
|
clReleaseEvent(event);
|
||||||
|
@ -1844,17 +1908,19 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
|
||||||
const int64_t ne3 = tensor->ne[3];
|
const int64_t ne3 = tensor->ne[3];
|
||||||
|
|
||||||
const ggml_type type = tensor->type;
|
const ggml_type type = tensor->type;
|
||||||
const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type);
|
const size_t s_sz = ggml_type_size(type) * (size_t) (ne0 * ne1 / ggml_blck_size(type));
|
||||||
|
const size_t q_sz = s_sz * (size_t) (ne2 * ne3);
|
||||||
|
|
||||||
size_t q_size;
|
size_t q_size;
|
||||||
cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size);
|
cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size);
|
||||||
|
|
||||||
tensor->data = data;
|
tensor->data = data;
|
||||||
// copy tensor to device
|
// copy tensor to device
|
||||||
|
size_t offset = 0;
|
||||||
for (int64_t i3 = 0; i3 < ne3; i3++) {
|
for (int64_t i3 = 0; i3 < ne3; i3++) {
|
||||||
for (int64_t i2 = 0; i2 < ne2; i2++) {
|
for (int64_t i2 = 0; i2 < ne2; i2++) {
|
||||||
int i = i3*ne2 + i2;
|
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, dst, offset, tensor, i3, i2, NULL));
|
||||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, dst, i*ne0*ne1, tensor, i3, i2, NULL));
|
offset += s_sz;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
27 ggml.h
@@ -326,7 +326,7 @@ extern "C" {
        GGML_TYPE_COUNT,
    };

-   enum ggml_backend {
+   enum ggml_backend_type {
        GGML_BACKEND_CPU = 0,
        GGML_BACKEND_GPU = 10,
        GGML_BACKEND_GPU_SPLIT = 20,
@@ -401,10 +401,14 @@ extern "C" {
        GGML_OP_CLAMP,
        GGML_OP_CONV_1D,
        GGML_OP_CONV_2D,
+       GGML_OP_CONV_TRANSPOSE_1D,
        GGML_OP_CONV_TRANSPOSE_2D,
        GGML_OP_POOL_1D,
        GGML_OP_POOL_2D,

+       GGML_OP_CONV_1D_STAGE_0,  // internal
+       GGML_OP_CONV_1D_STAGE_1,  // internal
+
        GGML_OP_UPSCALE, // nearest interpolate

        GGML_OP_FLASH_ATTN,
@@ -476,7 +480,9 @@ extern "C" {
    // n-dimensional tensor
    struct ggml_tensor {
        enum ggml_type type;
-       enum ggml_backend backend;
+       enum ggml_backend_type backend;
+
+       struct ggml_backend_buffer * buffer;

        int n_dims;
        int64_t ne[GGML_MAX_DIMS]; // number of elements
@@ -510,7 +516,7 @@ extern "C" {

        void * extra; // extra things e.g. for ggml-cuda.cu

-       char padding[4];
+       char padding[12];
    };

    static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@@ -1354,7 +1360,7 @@ extern "C" {

    // alibi position embedding
    // in-place, returns view(a)
-   struct ggml_tensor * ggml_alibi(
+   GGML_API struct ggml_tensor * ggml_alibi(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            int                   n_past,
@@ -1363,7 +1369,7 @@ extern "C" {

    // clamp
    // in-place, returns view(a)
-   struct ggml_tensor * ggml_clamp(
+   GGML_API struct ggml_tensor * ggml_clamp(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            float                 min,
@@ -1386,6 +1392,14 @@ extern "C" {
            int                   s,
            int                   d);

+   GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
+           struct ggml_context * ctx,
+           struct ggml_tensor  * a,
+           struct ggml_tensor  * b,
+           int                   s0,
+           int                   p0,
+           int                   d0);
+
    GGML_API struct ggml_tensor * ggml_conv_2d(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
|
||||||
GGML_OPT_NO_CONTEXT,
|
GGML_OPT_NO_CONTEXT,
|
||||||
GGML_OPT_INVALID_WOLFE,
|
GGML_OPT_INVALID_WOLFE,
|
||||||
GGML_OPT_FAIL,
|
GGML_OPT_FAIL,
|
||||||
|
GGML_OPT_CANCEL,
|
||||||
|
|
||||||
GGML_LINESEARCH_FAIL = -128,
|
GGML_LINESEARCH_FAIL = -128,
|
||||||
GGML_LINESEARCH_MINIMUM_STEP,
|
GGML_LINESEARCH_MINIMUM_STEP,
|
||||||
|
@ -2089,7 +2104,7 @@ extern "C" {
|
||||||
enum ggml_type vec_dot_type;
|
enum ggml_type vec_dot_type;
|
||||||
} ggml_type_traits_t;
|
} ggml_type_traits_t;
|
||||||
|
|
||||||
ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
|
GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
|
|
|
@@ -69,4 +69,3 @@ python -m twine upload dist/*
## TODO
- [ ] Add tests
- [ ] Include conversion scripts as command line entry points in this package.
-- Add CI workflow for releasing the package.
@@ -85,10 +85,14 @@ class MODEL_ARCH(IntEnum):
     GPTNEOX : int = auto()
     MPT : int = auto()
     STARCODER : int = auto()
+    PERSIMMON : int = auto()
+    REFACT : int = auto()
+    BERT : int = auto()


 class MODEL_TENSOR(IntEnum):
     TOKEN_EMBD : int = auto()
+    TOKEN_TYPES : int = auto()
     POS_EMBD : int = auto()
     OUTPUT : int = auto()
     OUTPUT_NORM : int = auto()
@@ -105,6 +109,8 @@ class MODEL_TENSOR(IntEnum):
     FFN_DOWN : int = auto()
     FFN_UP : int = auto()
     FFN_NORM : int = auto()
+    ATTN_Q_NORM : int = auto()
+    ATTN_K_NORM : int = auto()


 MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
@@ -116,78 +122,169 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.GPTNEOX: "gptneox",
     MODEL_ARCH.MPT: "mpt",
     MODEL_ARCH.STARCODER: "starcoder",
+    MODEL_ARCH.PERSIMMON: "persimmon",
+    MODEL_ARCH.REFACT: "refact",
+    MODEL_ARCH.BERT: "bert",
 }

-MODEL_TENSOR_NAMES: dict[MODEL_ARCH, dict[MODEL_TENSOR, str]] = {
-    MODEL_ARCH.LLAMA: {
-        MODEL_TENSOR.TOKEN_EMBD: "token_embd",
-        MODEL_TENSOR.OUTPUT_NORM: "output_norm",
-        MODEL_TENSOR.OUTPUT: "output",
-        MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
-        MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
-        MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q",
-        MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k",
-        MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v",
-        MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
-        MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
-        MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
-        MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
-        MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
-        MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
-    },
-    MODEL_ARCH.GPTNEOX: {
-        MODEL_TENSOR.TOKEN_EMBD: "token_embd",
-        MODEL_TENSOR.OUTPUT_NORM: "output_norm",
-        MODEL_TENSOR.OUTPUT: "output",
-        MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
-        MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
-        MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
-        MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
-        MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
-        MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
-    },
-    MODEL_ARCH.FALCON: {
-        MODEL_TENSOR.TOKEN_EMBD: "token_embd",
-        MODEL_TENSOR.OUTPUT_NORM: "output_norm",
-        MODEL_TENSOR.OUTPUT: "output",
-        MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
-        MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2",
-        MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
-        MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
-        MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
-        MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
-    },
-    MODEL_ARCH.BAICHUAN: {
-        MODEL_TENSOR.TOKEN_EMBD: "token_embd",
-        MODEL_TENSOR.OUTPUT_NORM: "output_norm",
-        MODEL_TENSOR.OUTPUT: "output",
-        MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
-        MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
-        MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q",
-        MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k",
-        MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v",
-        MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
-        MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
-        MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
-        MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
-        MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
-        MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
-    },
-    MODEL_ARCH.STARCODER: {
+TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.TOKEN_EMBD: "token_embd",
+    MODEL_TENSOR.TOKEN_TYPES: "token_types",
     MODEL_TENSOR.POS_EMBD: "position_embd",
     MODEL_TENSOR.OUTPUT_NORM: "output_norm",
     MODEL_TENSOR.OUTPUT: "output",
+    MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
     MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
+    MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2",
     MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
+    MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q",
+    MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k",
+    MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v",
     MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
+    MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
+    MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm",
+    MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm",
     MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
+    MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
     MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
     MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
-    },
-    MODEL_ARCH.GPT2: {
+}
+
+MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
+    MODEL_ARCH.LLAMA: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
+    MODEL_ARCH.GPTNEOX: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_QKV,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
+    MODEL_ARCH.FALCON: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_NORM_2,
+        MODEL_TENSOR.ATTN_QKV,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
+    MODEL_ARCH.BAICHUAN: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
+    MODEL_ARCH.STARCODER: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.POS_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_QKV,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
+    MODEL_ARCH.BERT: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.TOKEN_TYPES,
+        MODEL_TENSOR.POS_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
+    MODEL_ARCH.MPT: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_QKV,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
+    MODEL_ARCH.GPTJ: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
+    MODEL_ARCH.PERSIMMON: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_QKV,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.ATTN_Q_NORM,
+        MODEL_TENSOR.ATTN_K_NORM,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+    ],
+    MODEL_ARCH.REFACT: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
+    MODEL_ARCH.GPT2: [
         # TODO
-    },
+    ],
     # TODO
 }
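The restructuring above replaces the per-architecture name dictionaries with a single TENSOR_NAMES template table plus a MODEL_TENSORS membership table. A rough sketch of how the two combine (illustrative only; the helper name tensor_names_for and the n_blocks value are made up here, and the real conversion code does the equivalent inside TensorNameMap):

def tensor_names_for(arch: MODEL_ARCH, n_blocks: int) -> list[str]:
    # For every tensor kind the architecture declares, format the shared
    # name template, expanding the per-block "{bid}" placeholder when present.
    names: list[str] = []
    for tensor in MODEL_TENSORS[arch]:
        template = TENSOR_NAMES[tensor]
        if "{bid}" in template:
            names.extend(template.format(bid=bid) for bid in range(n_blocks))
        else:
            names.append(template)
    return names

# e.g. tensor_names_for(MODEL_ARCH.GPTNEOX, 2) would yield "token_embd",
# "output_norm", "output", "blk.0.attn_norm", "blk.1.attn_norm", ...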
@@ -201,6 +298,9 @@ MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.ROPE_FREQS,
         MODEL_TENSOR.ATTN_ROT_EMBD,
     ],
+    MODEL_ARCH.PERSIMMON: [
+        MODEL_TENSOR.ROPE_FREQS,
+    ]
 }
@@ -209,15 +309,23 @@ class TensorNameMap:
         # Token embeddings
         MODEL_TENSOR.TOKEN_EMBD: (
             "gpt_neox.embed_in", # gptneox
-            "transformer.wte", # gpt2 mpt
+            "transformer.wte", # gpt2 gpt-j mpt refact
             "transformer.word_embeddings", # falcon
             "model.embed_tokens", # llama-hf
             "tok_embeddings", # llama-pth
+            "embeddings.word_embeddings", # bert
+            "language_model.embedding.word_embeddings", # persimmon
+        ),
+
+        # Token type embeddings
+        MODEL_TENSOR.TOKEN_TYPES: (
+            "embeddings.token_type_embeddings", # bert
         ),

         # Position embeddings
         MODEL_TENSOR.POS_EMBD: (
             "transformer.wpe", # gpt2
+            "embeddings.position_embeddings", # bert
         ),

         # Output
@@ -225,14 +333,19 @@ class TensorNameMap:
             "embed_out", # gptneox
             "lm_head", # gpt2 mpt falcon llama-hf baichuan
             "output", # llama-pth
+            "word_embeddings_for_head", # persimmon
         ),

         # Output norm
         MODEL_TENSOR.OUTPUT_NORM: (
             "gpt_neox.final_layer_norm", # gptneox
-            "transformer.ln_f", # gpt2 falcon
+            "transformer.ln_f", # gpt2 gpt-j falcon
             "model.norm", # llama-hf baichuan
             "norm", # llama-pth
+            "embeddings.LayerNorm", # bert
+            "transformer.norm_f", # mpt
+            "ln_f", # refact
+            "language_model.encoder.final_layernorm", # persimmon
         ),

         # Rope frequencies
@@ -245,12 +358,14 @@ class TensorNameMap:
         # Attention norm
         MODEL_TENSOR.ATTN_NORM: (
             "gpt_neox.layers.{bid}.input_layernorm", # gptneox
-            "transformer.h.{bid}.ln_1", # gpt2
+            "transformer.h.{bid}.ln_1", # gpt2 gpt-j refact
             "transformer.blocks.{bid}.norm_1", # mpt
             "transformer.h.{bid}.input_layernorm", # falcon7b
             "transformer.h.{bid}.ln_mlp", # falcon40b
             "model.layers.{bid}.input_layernorm", # llama-hf
             "layers.{bid}.attention_norm", # llama-pth
+            "encoder.layer.{bid}.attention.output.LayerNorm", # bert
+            "language_model.encoder.layers.{bid}.input_layernorm", # persimmon
         ),

         # Attention norm 2
@@ -264,34 +379,44 @@ class TensorNameMap:
             "transformer.h.{bid}.attn.c_attn", # gpt2
             "transformer.blocks.{bid}.attn.Wqkv", # mpt
             "transformer.h.{bid}.self_attention.query_key_value", # falcon
+            "language_model.encoder.layers.{bid}.self_attention.query_key_value", # persimmon
         ),

         # Attention query
         MODEL_TENSOR.ATTN_Q: (
             "model.layers.{bid}.self_attn.q_proj", # llama-hf
             "layers.{bid}.attention.wq", # llama-pth
+            "encoder.layer.{bid}.attention.self.query", # bert
+            "transformer.h.{bid}.attn.q_proj", # gpt-j
         ),

         # Attention key
         MODEL_TENSOR.ATTN_K: (
             "model.layers.{bid}.self_attn.k_proj", # llama-hf
             "layers.{bid}.attention.wk", # llama-pth
+            "encoder.layer.{bid}.attention.self.key", # bert
+            "transformer.h.{bid}.attn.k_proj", # gpt-j
         ),

         # Attention value
         MODEL_TENSOR.ATTN_V: (
             "model.layers.{bid}.self_attn.v_proj", # llama-hf
             "layers.{bid}.attention.wv", # llama-pth
+            "encoder.layer.{bid}.attention.self.value", # bert
+            "transformer.h.{bid}.attn.v_proj", # gpt-j
         ),

         # Attention output
         MODEL_TENSOR.ATTN_OUT: (
             "gpt_neox.layers.{bid}.attention.dense", # gptneox
-            "transformer.h.{bid}.attn.c_proj", # gpt2
+            "transformer.h.{bid}.attn.c_proj", # gpt2 refact
             "transformer.blocks.{bid}.attn.out_proj", # mpt
             "transformer.h.{bid}.self_attention.dense", # falcon
             "model.layers.{bid}.self_attn.o_proj", # llama-hf
             "layers.{bid}.attention.wo", # llama-pth
+            "encoder.layer.{bid}.attention.output.dense", # bert
+            "transformer.h.{bid}.attn.out_proj", # gpt-j
+            "language_model.encoder.layers.{bid}.self_attention.dense" # persimmon
         ),

         # Rotary embeddings
@@ -303,10 +428,12 @@ class TensorNameMap:
         # Feed-forward norm
         MODEL_TENSOR.FFN_NORM: (
             "gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox
-            "transformer.h.{bid}.ln_2", # gpt2
+            "transformer.h.{bid}.ln_2", # gpt2 refact
             "transformer.blocks.{bid}.norm_2", # mpt
             "model.layers.{bid}.post_attention_layernorm", # llama-hf
             "layers.{bid}.ffn_norm", # llama-pth
+            "encoder.layer.{bid}.output.LayerNorm", # bert
+            "language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon
         ),

         # Feed-forward up
@@ -315,51 +442,65 @@ class TensorNameMap:
             "transformer.h.{bid}.mlp.c_fc", # gpt2
             "transformer.blocks.{bid}.ffn.up_proj", # mpt
             "transformer.h.{bid}.mlp.dense_h_to_4h", # falcon
-            "model.layers.{bid}.mlp.up_proj", # llama-hf
+            "model.layers.{bid}.mlp.up_proj", # llama-hf refact
             "layers.{bid}.feed_forward.w3", # llama-pth
+            "encoder.layer.{bid}.intermediate.dense", # bert
+            "transformer.h.{bid}.mlp.fc_in", # gpt-j
+            "language_model.encoder.layers.{bid}.mlp.dense_h_to_4h", # persimmon
         ),

         # Feed-forward gate
         MODEL_TENSOR.FFN_GATE: (
-            "model.layers.{bid}.mlp.gate_proj", # llama-hf
+            "model.layers.{bid}.mlp.gate_proj", # llama-hf refact
             "layers.{bid}.feed_forward.w1", # llama-pth
         ),

         # Feed-forward down
         MODEL_TENSOR.FFN_DOWN: (
             "gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox
-            "transformer.h.{bid}.mlp.c_proj", # gpt2
+            "transformer.h.{bid}.mlp.c_proj", # gpt2 refact
             "transformer.blocks.{bid}.ffn.down_proj", # mpt
             "transformer.h.{bid}.mlp.dense_4h_to_h", # falcon
             "model.layers.{bid}.mlp.down_proj", # llama-hf
             "layers.{bid}.feed_forward.w2", # llama-pth
+            "encoder.layer.{bid}.output.dense", # bert
+            "transformer.h.{bid}.mlp.fc_out", # gpt-j
+            "language_model.encoder.layers.{bid}.mlp.dense_4h_to_h", # persimmon
         ),
+
+        MODEL_TENSOR.ATTN_Q_NORM: (
+            "language_model.encoder.layers.{bid}.self_attention.q_layernorm",
+        ),
+
+        MODEL_TENSOR.ATTN_K_NORM: (
+            "language_model.encoder.layers.{bid}.self_attention.k_layernorm",
+        ),
+
+        MODEL_TENSOR.ROPE_FREQS: (
+            "language_model.encoder.layers.{bid}.self_attention.rotary_emb.inv_freq", # persimmon
+        )
     }

     mapping: dict[str, tuple[MODEL_TENSOR, str]]

-    tensor_names: dict[MODEL_TENSOR, str]
-
     def __init__(self, arch: MODEL_ARCH, n_blocks: int):
-        mapping = self.mapping = {}
-        tensor_names = self.tensor_names = MODEL_TENSOR_NAMES[arch]
+        self.mapping = {}
         for tensor, keys in self.mappings_cfg.items():
-            tensor_name = tensor_names.get(tensor)
-            if tensor_name is None:
+            if tensor not in MODEL_TENSORS[arch]:
                 continue
-            mapping[tensor_name] = (tensor, tensor_name)
+            tensor_name = TENSOR_NAMES[tensor]
+            self.mapping[tensor_name] = (tensor, tensor_name)
             for key in keys:
-                mapping[key] = (tensor, tensor_name)
+                self.mapping[key] = (tensor, tensor_name)
         for bid in range(n_blocks):
             for tensor, keys in self.block_mappings_cfg.items():
-                tensor_name = tensor_names.get(tensor)
-                if tensor_name is None:
+                if tensor not in MODEL_TENSORS[arch]:
                     continue
-                tensor_name = tensor_name.format(bid = bid)
-                mapping[tensor_name] = (tensor, tensor_name)
+                tensor_name = TENSOR_NAMES[tensor].format(bid = bid)
+                self.mapping[tensor_name] = (tensor, tensor_name)
                 for key in keys:
                     key = key.format(bid = bid)
-                    mapping[key] = (tensor, tensor_name)
+                    self.mapping[key] = (tensor, tensor_name)

     def get_type_and_name(self, key: str, try_suffixes: Sequence[str] = ()) -> tuple[MODEL_TENSOR, str] | None:
         result = self.mapping.get(key)
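With the membership check now driven by MODEL_TENSORS, a TensorNameMap built for one architecture simply never learns names for tensors that architecture does not use. A small usage sketch (illustrative only; the checkpoint name and block count are made up):

tmap = TensorNameMap(MODEL_ARCH.LLAMA, n_blocks=32)

# Look up a Hugging Face style checkpoint name, tolerating common suffixes.
hit = tmap.get_type_and_name(
    "model.layers.0.self_attn.q_proj.weight",
    try_suffixes=(".weight", ".bias"),
)
if hit is not None:
    tensor_kind, mapped_name = hit
    # Expected to resolve to the ATTN_Q kind and a "blk.0.attn_q..." style name.
    print(tensor_kind, mapped_name)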
@@ -800,22 +941,25 @@ class SpecialVocab:
     special_token_types: tuple[str, ...] = ('bos', 'eos', 'unk', 'sep', 'pad')
     special_token_ids: dict[str, int] = {}

-    def __init__(self, path: Path, load_merges: bool = False, special_token_types: tuple[str, ...] | None = None):
+    def __init__(
+        self, path: str | os.PathLike[str], load_merges: bool = False,
+        special_token_types: tuple[str, ...] | None = None,
+    ):
         self.special_token_ids = {}
         self.load_merges = load_merges
         if special_token_types is not None:
             self.special_token_types = special_token_types
-        self.load(path)
+        self._load(Path(path))

-    def load(self, path: Path):
-        if not self.try_load_from_tokenizer_json(path):
-            self.try_load_from_config_json(path)
+    def _load(self, path: Path) -> None:
+        if not self._try_load_from_tokenizer_json(path):
+            self._try_load_from_config_json(path)

-    def try_load_from_tokenizer_json(self, path: Path) -> bool:
+    def _try_load_from_tokenizer_json(self, path: Path) -> bool:
         tokenizer_file = path / 'tokenizer.json'
         if not tokenizer_file.is_file():
             return False
-        with open(tokenizer_file, 'r', encoding = 'utf-8') as f:
+        with open(tokenizer_file, encoding = 'utf-8') as f:
             tokenizer = json.load(f)
         if self.load_merges:
             merges = tokenizer.get('model', {}).get('merges')
@@ -825,7 +969,7 @@ class SpecialVocab:
         added_tokens = tokenizer.get('added_tokens')
         if added_tokens is None or not tokenizer_config_file.is_file():
             return True
-        with open(tokenizer_config_file, 'r', encoding = 'utf-8') as f:
+        with open(tokenizer_config_file, encoding = 'utf-8') as f:
             tokenizer_config = json.load(f)
         for typ in self.special_token_types:
             entry = tokenizer_config.get(f'{typ}_token')
@@ -844,11 +988,11 @@ class SpecialVocab:
                     break
         return True

-    def try_load_from_config_json(self, path: Path) -> bool:
+    def _try_load_from_config_json(self, path: Path) -> bool:
         config_file = path / 'config.json'
         if not config_file.is_file():
             return False
-        with open(config_file, 'r', encoding = 'utf-8') as f:
+        with open(config_file, encoding = 'utf-8') as f:
             config = json.load(f)
         for typ in self.special_token_types:
             maybe_token_id = config.get(f'{typ}_token_id')
@@ -856,7 +1000,7 @@ class SpecialVocab:
                 self.special_token_ids[typ] = maybe_token_id
         return True

-    def add_to_gguf(self, gw: GGUFWriter):
+    def add_to_gguf(self, gw: GGUFWriter) -> None:
         if len(self.merges) > 0:
             print(f'gguf: Adding {len(self.merges)} merge(s).')
             gw.add_token_merges(self.merges)
@@ -868,8 +1012,8 @@ class SpecialVocab:
             print(f'gguf: Setting special token type {typ} to {tokid}')
             handler(tokid)

-    def __repr__(self):
-        return f'<SpecialVocab with {len(self.merges)} merges and special tokens {self.special_token_ids if self.special_token_ids else "unset"}>'
+    def __repr__(self) -> str:
+        return f'<SpecialVocab with {len(self.merges)} merges and special tokens {self.special_token_ids or "unset"}>'


 # Example usage:
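The constructor now accepts a plain string or os.PathLike and the loaders are private. A usage sketch (illustrative only; the model path is hypothetical and writer stands for an already-constructed GGUFWriter in the calling conversion script):

special_vocab = SpecialVocab("models/my-model", load_merges=True)  # hypothetical path
print(special_vocab)               # __repr__ reports the merge count and special token ids
special_vocab.add_to_gguf(writer)  # writer: GGUFWriter, assumed to exist in the script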
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "gguf"
-version = "0.3.3"
+version = "0.4.4"
 description = "Write ML models in GGUF for GGML"
 authors = ["GGML <ggml@ggml.ai>"]
 packages = [
746
k_quants.c
@@ -54,6 +54,10 @@ inline static int32_t vaddvq_s32(int32x4_t v) {
 #endif
 #endif

+#ifdef __riscv_v_intrinsic
+#include <riscv_vector.h>
+#endif
+
 #undef MIN
 #undef MAX
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
@@ -65,7 +69,6 @@ inline static int32_t vaddvq_s32(int32x4_t v) {
 // 2-6 bit quantization in super-blocks
 //

-
 //
 // ===================== Helper functions
 //
@@ -344,7 +347,6 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict
     const float q4scale = 15.f;

     for (int i = 0; i < nb; i++) {
-
         float max_scale = 0; // as we are deducting the min, scales are always positive
         float max_min = 0;
         for (int j = 0; j < QK_K/16; ++j) {
@@ -1582,6 +1584,90 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri

     *s = hsum_float_8(acc);

+#elif defined __riscv_v_intrinsic
+
+    float sumf = 0;
+    uint8_t temp_01[32] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+
+    for (int i = 0; i < nb; ++i) {
+
+        const uint8_t * q2 = x[i].qs;
+        const int8_t * q8 = y[i].qs;
+        const uint8_t * sc = x[i].scales;
+
+        const float dall = y[i].d * ggml_fp16_to_fp32(x[i].d);
+        const float dmin = -y[i].d * ggml_fp16_to_fp32(x[i].dmin);
+
+        size_t vl = 16;
+
+        vuint8m1_t scales = __riscv_vle8_v_u8m1(sc, vl);
+        vuint8m1_t aux = __riscv_vand_vx_u8m1(scales, 0x0F, vl);
+
+        vint16m1_t q8sums = __riscv_vle16_v_i16m1(y[i].bsums, vl);
+
+        vuint8mf2_t scales_2 = __riscv_vle8_v_u8mf2(sc, vl);
+        vuint8mf2_t mins8 = __riscv_vsrl_vx_u8mf2(scales_2, 0x4, vl);
+        vint16m1_t mins = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(mins8, vl));
+        vint32m2_t prod = __riscv_vwmul_vv_i32m2(q8sums, mins, vl);
+        vint32m1_t vsums = __riscv_vredsum_vs_i32m2_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl);
+
+        sumf += dmin * __riscv_vmv_x_s_i32m1_i32(vsums);
+
+        vl = 32;
+
+        vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
+        vuint8m1_t v_b = __riscv_vle8_v_u8m1(temp_01, vl);
+
+        uint8_t is=0;
+        int isum=0;
+
+        for (int j = 0; j < QK_K/128; ++j) {
+            // load Q2
+            vuint8m1_t q2_x = __riscv_vle8_v_u8m1(q2, vl);
+
+            vuint8m1_t q2_0 = __riscv_vand_vx_u8m1(q2_x, 0x03, vl);
+            vuint8m1_t q2_1 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q2_x, 0x2, vl), 0x03 , vl);
+            vuint8m1_t q2_2 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q2_x, 0x4, vl), 0x03 , vl);
+            vuint8m1_t q2_3 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q2_x, 0x6, vl), 0x03 , vl);
+
+            // duplicate scale elements for product
+            vuint8m1_t sc0 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 0+is, vl), vl);
+            vuint8m1_t sc1 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 2+is, vl), vl);
+            vuint8m1_t sc2 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 4+is, vl), vl);
+            vuint8m1_t sc3 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 6+is, vl), vl);
+
+            vint16m2_t p0 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_0, sc0, vl));
+            vint16m2_t p1 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_1, sc1, vl));
+            vint16m2_t p2 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_2, sc2, vl));
+            vint16m2_t p3 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_3, sc3, vl));
+
+            // load Q8
+            vint8m1_t q8_0 = __riscv_vle8_v_i8m1(q8, vl);
+            vint8m1_t q8_1 = __riscv_vle8_v_i8m1(q8+32, vl);
+            vint8m1_t q8_2 = __riscv_vle8_v_i8m1(q8+64, vl);
+            vint8m1_t q8_3 = __riscv_vle8_v_i8m1(q8+96, vl);
+
+            vint32m4_t s0 = __riscv_vwmul_vv_i32m4(p0, __riscv_vwcvt_x_x_v_i16m2(q8_0, vl), vl);
+            vint32m4_t s1 = __riscv_vwmul_vv_i32m4(p1, __riscv_vwcvt_x_x_v_i16m2(q8_1, vl), vl);
+            vint32m4_t s2 = __riscv_vwmul_vv_i32m4(p2, __riscv_vwcvt_x_x_v_i16m2(q8_2, vl), vl);
+            vint32m4_t s3 = __riscv_vwmul_vv_i32m4(p3, __riscv_vwcvt_x_x_v_i16m2(q8_3, vl), vl);
+
+            vint32m1_t isum0 = __riscv_vredsum_vs_i32m4_i32m1(__riscv_vadd_vv_i32m4(s0, s1, vl), vzero, vl);
+            vint32m1_t isum1 = __riscv_vredsum_vs_i32m4_i32m1(__riscv_vadd_vv_i32m4(s2, s3, vl), isum0, vl);
+
+            isum += __riscv_vmv_x_s_i32m1_i32(isum1);
+
+            q2+=32; q8+=128; is=8;
+
+        }
+
+        sumf += dall * isum;
+
+    }
+
+    *s = sumf;
+
 #else

     float sumf = 0;
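For orientation, the RISC-V block above computes the same quantity as the scalar path: per 256-value super-block, 4-bit scales and mins are unpacked from x[i].scales, the mins are folded in through the pre-computed bsums, and the 2-bit quants are multiplied with the Q8 values plane by plane. A rough Python sketch of that arithmetic (illustrative only; the dict-style block layout, QK_K = 256 and the field names are assumptions made for readability, not the C structs):

QK_K = 256  # assumed super-block size

def vec_dot_q2_K_q8_K_ref(x_blocks, y_blocks):
    # x_blocks: Q2_K super-blocks as dicts with 'd', 'dmin' (floats already
    #           converted from fp16), 'scales' (16 bytes), 'qs' (64 bytes).
    # y_blocks: Q8_K super-blocks as dicts with 'd', 'qs' (256 signed ints),
    #           'bsums' (16 partial sums of 16 q8 values each).
    sumf = 0.0
    for xb, yb in zip(x_blocks, y_blocks):
        dall = yb['d'] * xb['d']
        dmin = -yb['d'] * xb['dmin']

        scales = [b & 0x0F for b in xb['scales']]  # low nibbles: per-16-value scales
        mins   = [b >> 4   for b in xb['scales']]  # high nibbles: per-16-value mins

        # the mins only ever multiply whole 16-value sums, hence the bsums shortcut
        sumf += dmin * sum(m * s for m, s in zip(mins, yb['bsums']))

        isum = 0
        for half in range(2):                      # QK_K/128 iterations
            q2 = xb['qs'][32 * half : 32 * half + 32]
            for shift in range(4):                 # 2-bit planes at bit offsets 0, 2, 4, 6
                for k in range(32):
                    v  = (q2[k] >> (2 * shift)) & 3
                    sc = scales[8 * half + 2 * shift + k // 16]
                    isum += v * sc * yb['qs'][128 * half + 32 * shift + k]
        sumf += dall * isum
    return sumf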
@@ -1807,6 +1893,64 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri

     *s = hsum_float_8(acc) + summs;

+#elif defined __riscv_v_intrinsic
+
+    uint32_t aux32[2];
+    const uint8_t * scales = (const uint8_t *)aux32;
+
+    float sumf = 0;
+
+    for (int i = 0; i < nb; ++i) {
+
+        const float d = y[i].d * (float)x[i].d;
+        const float dmin = -y[i].d * (float)x[i].dmin;
+
+        const uint8_t * restrict q2 = x[i].qs;
+        const int8_t * restrict q8 = y[i].qs;
+        const uint32_t * restrict sc = (const uint32_t *)x[i].scales;
+
+        aux32[0] = sc[0] & 0x0f0f0f0f;
+        aux32[1] = (sc[0] >> 4) & 0x0f0f0f0f;
+
+        sumf += dmin * (scales[4] * y[i].bsums[0] + scales[5] * y[i].bsums[1] + scales[6] * y[i].bsums[2] + scales[7] * y[i].bsums[3]);
+
+        int isum1 = 0;
+        int isum2 = 0;
+
+        size_t vl = 16;
+
+        vint16m1_t vzero = __riscv_vmv_v_x_i16m1(0, 1);
+
+        // load Q2
+        vuint8mf2_t q2_x = __riscv_vle8_v_u8mf2(q2, vl);
+
+        vint8mf2_t q2_0 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(q2_x, 0x03, vl));
+        vint8mf2_t q2_1 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(q2_x, 0x2, vl), 0x03 , vl));
+        vint8mf2_t q2_2 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(q2_x, 0x4, vl), 0x03 , vl));
+        vint8mf2_t q2_3 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(q2_x, 0x6, vl), 0x03 , vl));
+
+        // load Q8, and take product with Q2
+        vint16m1_t p0 = __riscv_vwmul_vv_i16m1(q2_0, __riscv_vle8_v_i8mf2(q8, vl), vl);
+        vint16m1_t p1 = __riscv_vwmul_vv_i16m1(q2_1, __riscv_vle8_v_i8mf2(q8+16, vl), vl);
+        vint16m1_t p2 = __riscv_vwmul_vv_i16m1(q2_2, __riscv_vle8_v_i8mf2(q8+32, vl), vl);
+        vint16m1_t p3 = __riscv_vwmul_vv_i16m1(q2_3, __riscv_vle8_v_i8mf2(q8+48, vl), vl);
+
+        vint16m1_t vs_0 = __riscv_vredsum_vs_i16m1_i16m1(p0, vzero, vl);
+        vint16m1_t vs_1 = __riscv_vredsum_vs_i16m1_i16m1(p1, vzero, vl);
+        vint16m1_t vs_2 = __riscv_vredsum_vs_i16m1_i16m1(p2, vzero, vl);
+        vint16m1_t vs_3 = __riscv_vredsum_vs_i16m1_i16m1(p3, vzero, vl);
+
+        isum1 += __riscv_vmv_x_s_i16m1_i16(vs_0) * scales[0];
+        isum2 += __riscv_vmv_x_s_i16m1_i16(vs_1) * scales[1];
+        isum1 += __riscv_vmv_x_s_i16m1_i16(vs_2) * scales[2];
+        isum2 += __riscv_vmv_x_s_i16m1_i16(vs_3) * scales[3];
+
+        sumf += d * (isum1 + isum2);
+
+    }
+
+    *s = sumf;
+
 #else

     float sumf = 0;
@ -2220,6 +2364,106 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
|
||||||
|
|
||||||
*s = hsum_float_8(acc);
|
*s = hsum_float_8(acc);
|
||||||
|
|
||||||
|
#elif defined __riscv_v_intrinsic
|
||||||
|
|
||||||
|
uint32_t aux[3];
|
||||||
|
uint32_t utmp[4];
|
||||||
|
|
||||||
|
float sumf = 0;
|
||||||
|
for (int i = 0; i < nb; ++i) {
|
||||||
|
|
||||||
|
const uint8_t * restrict q3 = x[i].qs;
|
||||||
|
const uint8_t * restrict qh = x[i].hmask;
|
||||||
|
const int8_t * restrict q8 = y[i].qs;
|
||||||
|
|
||||||
|
memcpy(aux, x[i].scales, 12);
|
||||||
|
utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
|
||||||
|
utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
|
||||||
|
utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
|
||||||
|
utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
|
||||||
|
|
||||||
|
int8_t * scale = (int8_t *)utmp;
|
||||||
|
for (int j = 0; j < 16; ++j) scale[j] -= 32;
|
||||||
|
|
||||||
|
|
||||||
|
size_t vl = 32;
|
||||||
|
uint8_t m = 1;
|
||||||
|
|
||||||
|
vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
|
||||||
|
vuint8m1_t vqh = __riscv_vle8_v_u8m1(qh, vl);
|
||||||
|
|
||||||
|
int sum_t = 0;
|
||||||
|
|
||||||
|
for (int j = 0; j < QK_K; j += 128) {
|
||||||
|
|
||||||
|
vl = 32;
|
||||||
|
|
||||||
|
// load Q3
|
||||||
|
vuint8m1_t q3_x = __riscv_vle8_v_u8m1(q3, vl);
|
||||||
|
|
||||||
|
vint8m1_t q3_0 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q3_x, 0x03, vl));
|
||||||
|
vint8m1_t q3_1 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x2, vl), 0x03 , vl));
|
||||||
|
vint8m1_t q3_2 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x4, vl), 0x03 , vl));
|
||||||
|
vint8m1_t q3_3 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x6, vl), 0x03 , vl));
|
||||||
|
|
||||||
|
// compute mask for subtraction
|
||||||
|
vuint8m1_t qh_m0 = __riscv_vand_vx_u8m1(vqh, m, vl);
|
||||||
|
vbool8_t vmask_0 = __riscv_vmseq_vx_u8m1_b8(qh_m0, 0, vl);
|
||||||
|
vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_m(vmask_0, q3_0, 0x4, vl);
|
||||||
|
m <<= 1;
|
||||||
|
|
||||||
|
vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
|
||||||
|
vbool8_t vmask_1 = __riscv_vmseq_vx_u8m1_b8(qh_m1, 0, vl);
|
||||||
|
vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_m(vmask_1, q3_1, 0x4, vl);
|
||||||
|
m <<= 1;
|
||||||
|
|
||||||
|
vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
|
||||||
|
vbool8_t vmask_2 = __riscv_vmseq_vx_u8m1_b8(qh_m2, 0, vl);
|
||||||
|
vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_m(vmask_2, q3_2, 0x4, vl);
|
||||||
|
m <<= 1;
|
||||||
|
|
||||||
|
vuint8m1_t qh_m3 = __riscv_vand_vx_u8m1(vqh, m, vl);
|
||||||
|
vbool8_t vmask_3 = __riscv_vmseq_vx_u8m1_b8(qh_m3, 0, vl);
|
||||||
|
vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_m(vmask_3, q3_3, 0x4, vl);
|
||||||
|
m <<= 1;
|
||||||
|
|
||||||
|
// load Q8 and take product with Q3
|
||||||
|
vint16m2_t a0 = __riscv_vwmul_vv_i16m2(q3_m0, __riscv_vle8_v_i8m1(q8, vl), vl);
|
||||||
|
vint16m2_t a1 = __riscv_vwmul_vv_i16m2(q3_m1, __riscv_vle8_v_i8m1(q8+32, vl), vl);
|
||||||
|
vint16m2_t a2 = __riscv_vwmul_vv_i16m2(q3_m2, __riscv_vle8_v_i8m1(q8+64, vl), vl);
|
||||||
|
vint16m2_t a3 = __riscv_vwmul_vv_i16m2(q3_m3, __riscv_vle8_v_i8m1(q8+96, vl), vl);
|
||||||
|
|
||||||
|
vl = 16;
|
||||||
|
|
||||||
|
// retreive lane to multiply with scale
|
||||||
|
vint32m2_t aux0_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 0), (scale[0]), vl);
|
||||||
|
vint32m2_t aux0_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 1), (scale[1]), vl);
|
||||||
|
vint32m2_t aux1_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a1, 0), (scale[2]), vl);
|
||||||
|
vint32m2_t aux1_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a1, 1), (scale[3]), vl);
|
||||||
|
vint32m2_t aux2_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a2, 0), (scale[4]), vl);
|
||||||
|
vint32m2_t aux2_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a2, 1), (scale[5]), vl);
|
||||||
|
vint32m2_t aux3_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a3, 0), (scale[6]), vl);
|
||||||
|
vint32m2_t aux3_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a3, 1), (scale[7]), vl);
|
||||||
|
|
||||||
|
vint32m1_t isum0 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux0_0, aux0_1, vl), vzero, vl);
|
||||||
|
vint32m1_t isum1 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux1_0, aux1_1, vl), isum0, vl);
|
||||||
|
vint32m1_t isum2 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux2_0, aux2_1, vl), isum1, vl);
|
||||||
|
vint32m1_t isum3 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux3_0, aux3_1, vl), isum2, vl);
|
||||||
|
|
||||||
|
sum_t += __riscv_vmv_x_s_i32m1_i32(isum3);
|
||||||
|
|
||||||
|
q3 += 32; q8 += 128; scale += 8;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
const float d = ggml_fp16_to_fp32(x[i].d) * y[i].d;
|
||||||
|
|
||||||
|
sumf += d*sum_t;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
*s = sumf;
|
||||||
|
|
||||||
#else
|
#else
|
||||||
// scalar version
|
// scalar version
|
||||||
// This function is written like this so the compiler can manage to vectorize most of it
|
// This function is written like this so the compiler can manage to vectorize most of it
|
||||||
|
@ -2523,6 +2767,79 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
|
||||||
|
|
||||||
*s = hsum_float_8(acc);
|
*s = hsum_float_8(acc);
|
||||||
|
|
||||||
|
#elif defined __riscv_v_intrinsic
|
||||||
|
|
||||||
|
uint16_t aux16[2];
|
||||||
|
int8_t * scales = (int8_t *)aux16;
|
||||||
|
|
||||||
|
float sumf = 0;
|
||||||
|
|
||||||
|
for (int i = 0; i < nb; ++i) {
|
||||||
|
|
||||||
|
const uint8_t * restrict q3 = x[i].qs;
|
||||||
|
const int8_t * restrict q8 = y[i].qs;
|
||||||
|
|
||||||
|
const uint16_t a = *(const uint16_t *)x[i].scales;
|
||||||
|
aux16[0] = a & 0x0f0f;
|
||||||
|
aux16[1] = (a >> 4) & 0x0f0f;
|
||||||
|
|
||||||
|
for (int j = 0; j < 4; ++j) scales[j] -= 8;
|
||||||
|
|
||||||
|
int32_t isum = -4*(scales[0] * y[i].bsums[0] + scales[2] * y[i].bsums[1] + scales[1] * y[i].bsums[2] + scales[3] * y[i].bsums[3]);
|
||||||
|
|
||||||
|
const float d = y[i].d * (float)x[i].d;
|
||||||
|
|
||||||
|
vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
|
||||||
|
|
||||||
|
// load qh
|
||||||
|
vuint8mf4_t qh_x1 = __riscv_vle8_v_u8mf4(x[i].hmask, 8);
|
||||||
|
vuint8mf2_t qh_x2 = __riscv_vlmul_ext_v_u8mf4_u8mf2(__riscv_vsrl_vx_u8mf4(qh_x1, 1, 8));
|
||||||
|
|
||||||
|
size_t vl = 16;
|
||||||
|
|
||||||
|
// extend and combine both qh_x1 and qh_x2
|
||||||
|
vuint8mf2_t qh_x = __riscv_vslideup_vx_u8mf2(__riscv_vlmul_ext_v_u8mf4_u8mf2(qh_x1), qh_x2, vl/2, vl);
|
||||||
|
|
||||||
|
vuint8mf2_t qh_0 = __riscv_vand_vx_u8mf2(__riscv_vsll_vx_u8mf2(qh_x, 0x2, vl), 0x4, vl);
|
||||||
|
vuint8mf2_t qh_1 = __riscv_vand_vx_u8mf2(qh_x, 0x4, vl);
|
||||||
|
vuint8mf2_t qh_2 = __riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(qh_x, 0x2, vl), 0x4, vl);
|
||||||
|
vuint8mf2_t qh_3 = __riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(qh_x, 0x4, vl), 0x4, vl);
|
||||||
|
|
||||||
|
// load Q3
|
||||||
|
vuint8mf2_t q3_x = __riscv_vle8_v_u8mf2(q3, vl);
|
||||||
|
|
||||||
|
vuint8mf2_t q3h_0 = __riscv_vor_vv_u8mf2(__riscv_vand_vx_u8mf2(q3_x, 0x3, vl), qh_0, vl);
|
||||||
|
vuint8mf2_t q3h_1 = __riscv_vor_vv_u8mf2(__riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(q3_x, 2, vl), 0x3, vl), qh_1, vl);
|
||||||
|
vuint8mf2_t q3h_2 = __riscv_vor_vv_u8mf2(__riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(q3_x, 4, vl), 0x3, vl), qh_2, vl);
|
||||||
|
vuint8mf2_t q3h_3 = __riscv_vor_vv_u8mf2(__riscv_vsrl_vx_u8mf2(q3_x, 0x6, vl), qh_3, vl);
|
||||||
|
|
||||||
|
vint8mf2_t q3_0 = __riscv_vreinterpret_v_u8mf2_i8mf2(q3h_0);
|
||||||
|
vint8mf2_t q3_1 = __riscv_vreinterpret_v_u8mf2_i8mf2(q3h_1);
|
||||||
|
vint8mf2_t q3_2 = __riscv_vreinterpret_v_u8mf2_i8mf2(q3h_2);
|
||||||
|
vint8mf2_t q3_3 = __riscv_vreinterpret_v_u8mf2_i8mf2(q3h_3);
|
||||||
|
|
||||||
|
// load Q8 and take product with Q3
|
||||||
|
vint16m1_t p0 = __riscv_vwmul_vv_i16m1(q3_0, __riscv_vle8_v_i8mf2(q8, vl), vl);
|
||||||
|
vint16m1_t p1 = __riscv_vwmul_vv_i16m1(q3_1, __riscv_vle8_v_i8mf2(q8+16, vl), vl);
|
||||||
|
vint16m1_t p2 = __riscv_vwmul_vv_i16m1(q3_2, __riscv_vle8_v_i8mf2(q8+32, vl), vl);
|
||||||
|
vint16m1_t p3 = __riscv_vwmul_vv_i16m1(q3_3, __riscv_vle8_v_i8mf2(q8+48, vl), vl);
|
||||||
|
|
||||||
|
vint32m1_t vs_0 = __riscv_vwredsum_vs_i16m1_i32m1(p0, vzero, vl);
|
||||||
|
vint32m1_t vs_1 = __riscv_vwredsum_vs_i16m1_i32m1(p1, vzero, vl);
|
||||||
|
vint32m1_t vs_2 = __riscv_vwredsum_vs_i16m1_i32m1(p2, vzero, vl);
|
||||||
|
vint32m1_t vs_3 = __riscv_vwredsum_vs_i16m1_i32m1(p3, vzero, vl);
|
||||||
|
|
||||||
|
isum += __riscv_vmv_x_s_i32m1_i32(vs_0) * scales[0];
|
||||||
|
isum += __riscv_vmv_x_s_i32m1_i32(vs_1) * scales[2];
|
||||||
|
isum += __riscv_vmv_x_s_i32m1_i32(vs_2) * scales[1];
|
||||||
|
isum += __riscv_vmv_x_s_i32m1_i32(vs_3) * scales[3];
|
||||||
|
|
||||||
|
sumf += d * isum;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
*s = sumf;
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
int8_t aux8[QK_K];
|
int8_t aux8[QK_K];
|
||||||
|
@ -2823,6 +3140,78 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
|
||||||
|
|
||||||
*s = hsum_float_8(acc) + _mm_cvtss_f32(acc_m);
|
*s = hsum_float_8(acc) + _mm_cvtss_f32(acc_m);
|
||||||
|
|
||||||
|
#elif defined __riscv_v_intrinsic
|
||||||
|
|
||||||
|
const uint8_t * scales = (const uint8_t*)&utmp[0];
|
||||||
|
const uint8_t * mins = (const uint8_t*)&utmp[2];
|
||||||
|
|
||||||
|
float sumf = 0;
|
||||||
|
|
||||||
|
for (int i = 0; i < nb; ++i) {
|
||||||
|
|
||||||
|
size_t vl = 8;
|
||||||
|
|
||||||
|
const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
|
||||||
|
const float dmin = y[i].d * ggml_fp16_to_fp32(x[i].dmin);
|
||||||
|
|
||||||
|
vint16mf2_t q8sums_0 = __riscv_vlse16_v_i16mf2(y[i].bsums, 4, vl);
|
||||||
|
vint16mf2_t q8sums_1 = __riscv_vlse16_v_i16mf2(y[i].bsums+1, 4, vl);
|
||||||
|
vint16mf2_t q8sums = __riscv_vadd_vv_i16mf2(q8sums_0, q8sums_1, vl);
|
||||||
|
|
||||||
|
memcpy(utmp, x[i].scales, 12);
|
||||||
|
utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
|
||||||
|
const uint32_t uaux = utmp[1] & kmask1;
|
||||||
|
utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
|
||||||
|
utmp[2] = uaux;
|
||||||
|
utmp[0] &= kmask1;
|
||||||
|
|
||||||
|
vuint8mf4_t mins8 = __riscv_vle8_v_u8mf4(mins, vl);
|
||||||
|
vint16mf2_t v_mins = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vzext_vf2_u16mf2(mins8, vl));
|
||||||
|
vint32m1_t prod = __riscv_vwmul_vv_i32m1(q8sums, v_mins, vl);
|
||||||
|
|
||||||
|
vint32m1_t sumi = __riscv_vredsum_vs_i32m1_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl);
|
||||||
|
sumf -= dmin * __riscv_vmv_x_s_i32m1_i32(sumi);
|
||||||
|
|
||||||
|
const uint8_t * restrict q4 = x[i].qs;
|
||||||
|
const int8_t * restrict q8 = y[i].qs;
|
||||||
|
|
||||||
|
vl = 32;
|
||||||
|
|
||||||
|
int32_t sum_1 = 0;
|
||||||
|
int32_t sum_2 = 0;
|
||||||
|
|
||||||
|
vint16m1_t vzero = __riscv_vmv_v_x_i16m1(0, 1);
|
||||||
|
|
||||||
|
for (int j = 0; j < QK_K/64; ++j) {
|
||||||
|
// load Q4
|
||||||
|
vuint8m1_t q4_x = __riscv_vle8_v_u8m1(q4, vl);
|
||||||
|
|
||||||
|
// load Q8 and multiply it with lower Q4 nibble
|
||||||
|
vint8m1_t q8_0 = __riscv_vle8_v_i8m1(q8, vl);
|
||||||
|
vint8m1_t q4_0 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q4_x, 0x0F, vl));
|
||||||
|
vint16m2_t qv_0 = __riscv_vwmul_vv_i16m2(q4_0, q8_0, vl);
|
||||||
|
vint16m1_t vs_0 = __riscv_vredsum_vs_i16m2_i16m1(qv_0, vzero, vl);
|
||||||
|
|
||||||
|
sum_1 += __riscv_vmv_x_s_i16m1_i16(vs_0) * scales[2*j+0];
|
||||||
|
|
||||||
|
// load Q8 and multiply it with upper Q4 nibble
|
||||||
|
vint8m1_t q8_1 = __riscv_vle8_v_i8m1(q8+32, vl);
|
||||||
|
vint8m1_t q4_1 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q4_x, 0x04, vl));
|
||||||
|
vint16m2_t qv_1 = __riscv_vwmul_vv_i16m2(q4_1, q8_1, vl);
|
||||||
|
vint16m1_t vs_1 = __riscv_vredsum_vs_i16m2_i16m1(qv_1, vzero, vl);
|
||||||
|
|
||||||
|
sum_2 += __riscv_vmv_x_s_i16m1_i16(vs_1) * scales[2*j+1];
|
||||||
|
|
||||||
|
q4 += 32; q8 += 64;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
sumf += d*(sum_1 + sum_2);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
*s = sumf;
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
|
|
||||||
|
@ -3064,6 +3453,50 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
|
||||||
|
|
||||||
*s = hsum_float_8(acc) - summs;
|
*s = hsum_float_8(acc) - summs;
|
||||||
|
|
||||||
|
#elif defined __riscv_v_intrinsic
|
||||||
|
|
||||||
|
uint16_t s16[2];
|
||||||
|
const uint8_t * restrict scales = (const uint8_t *)s16;
|
||||||
|
|
||||||
|
float sumf = 0;
|
||||||
|
|
||||||
|
for (int i = 0; i < nb; ++i) {
|
||||||
|
|
||||||
|
const uint8_t * restrict q4 = x[i].qs;
|
||||||
|
const int8_t * restrict q8 = y[i].qs;
|
||||||
|
|
||||||
|
const uint16_t * restrict b = (const uint16_t *)x[i].scales;
|
||||||
|
s16[0] = b[0] & 0x0f0f;
|
||||||
|
s16[1] = (b[0] >> 4) & 0x0f0f;
|
||||||
|
|
||||||
|
sumf -= y[i].d * ggml_fp16_to_fp32(x[i].d[1]) * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3]));
|
||||||
|
const float d = y[i].d * ggml_fp16_to_fp32(x[i].d[0]);
|
||||||
|
|
||||||
|
size_t vl = 32;
|
||||||
|
|
||||||
|
vint16m1_t vzero = __riscv_vmv_v_x_i16m1(0, 1);
|
||||||
|
|
||||||
|
// load Q4
|
||||||
|
vuint8m1_t q4_x = __riscv_vle8_v_u8m1(q4, vl);
|
||||||
|
|
||||||
|
// load Q8 and multiply it with lower Q4 nibble
|
||||||
|
vint8m1_t q4_a = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q4_x, 0x0F, vl));
|
||||||
|
vint16m2_t va_0 = __riscv_vwmul_vv_i16m2(q4_a, __riscv_vle8_v_i8m1(q8, vl), vl);
|
||||||
|
vint16m1_t aux1 = __riscv_vredsum_vs_i16m2_i16m1(va_0, vzero, vl);
|
||||||
|
|
||||||
|
sumf += d*scales[0]*__riscv_vmv_x_s_i16m1_i16(aux1);
|
||||||
|
|
||||||
|
// load Q8 and multiply it with upper Q4 nibble
|
||||||
|
vint8m1_t q4_s = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q4_x, 0x04, vl));
|
||||||
|
vint16m2_t va_1 = __riscv_vwmul_vv_i16m2(q4_s, __riscv_vle8_v_i8m1(q8+32, vl), vl);
|
||||||
|
vint16m1_t aux2 = __riscv_vredsum_vs_i16m2_i16m1(va_1, vzero, vl);
|
||||||
|
|
||||||
|
sumf += d*scales[1]*__riscv_vmv_x_s_i16m1_i16(aux2);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
*s = sumf;
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
uint8_t aux8[QK_K];
|
uint8_t aux8[QK_K];
|
||||||
|
@ -3394,6 +3827,93 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
|
||||||
|
|
||||||
*s = hsum_float_8(acc) + summs;
|
*s = hsum_float_8(acc) + summs;
|
||||||
|
|
||||||
|
#elif defined __riscv_v_intrinsic
|
||||||
|
|
||||||
|
const uint8_t * scales = (const uint8_t*)&utmp[0];
|
||||||
|
const uint8_t * mins = (const uint8_t*)&utmp[2];
|
||||||
|
|
||||||
|
float sumf = 0;
|
||||||
|
float sums = 0.0;
|
||||||
|
|
||||||
|
size_t vl;
|
||||||
|
|
||||||
|
for (int i = 0; i < nb; ++i) {
|
||||||
|
|
||||||
|
vl = 8;
|
||||||
|
|
||||||
|
const uint8_t * restrict q5 = x[i].qs;
|
||||||
|
const uint8_t * restrict hm = x[i].qh;
|
||||||
|
const int8_t * restrict q8 = y[i].qs;
|
||||||
|
|
||||||
|
const float d = ggml_fp16_to_fp32(x[i].d) * y[i].d;
|
||||||
|
const float dmin = ggml_fp16_to_fp32(x[i].dmin) * y[i].d;
|
||||||
|
|
||||||
|
vint16mf2_t q8sums_0 = __riscv_vlse16_v_i16mf2(y[i].bsums, 4, vl);
|
||||||
|
vint16mf2_t q8sums_1 = __riscv_vlse16_v_i16mf2(y[i].bsums+1, 4, vl);
|
||||||
|
vint16mf2_t q8sums = __riscv_vadd_vv_i16mf2(q8sums_0, q8sums_1, vl);
|
||||||
|
|
||||||
|
memcpy(utmp, x[i].scales, 12);
|
||||||
|
utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
|
||||||
|
const uint32_t uaux = utmp[1] & kmask1;
|
||||||
|
utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
|
||||||
|
utmp[2] = uaux;
|
||||||
|
utmp[0] &= kmask1;
|
||||||
|
|
||||||
|
vuint8mf4_t mins8 = __riscv_vle8_v_u8mf4(mins, vl);
|
||||||
|
vint16mf2_t v_mins = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vzext_vf2_u16mf2(mins8, vl));
|
||||||
|
vint32m1_t prod = __riscv_vwmul_vv_i32m1(q8sums, v_mins, vl);
|
||||||
|
|
||||||
|
vint32m1_t sumi = __riscv_vredsum_vs_i32m1_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl);
|
||||||
|
sumf -= dmin * __riscv_vmv_x_s_i32m1_i32(sumi);
|
||||||
|
|
||||||
|
vl = 32;
|
||||||
|
int32_t aux32 = 0;
|
||||||
|
int is = 0;
|
||||||
|
|
||||||
|
uint8_t m = 1;
|
||||||
|
vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
|
||||||
|
vuint8m1_t vqh = __riscv_vle8_v_u8m1(hm, vl);
|
||||||
|
|
||||||
|
for (int j = 0; j < QK_K/64; ++j) {
|
||||||
|
// load Q5 and Q8
|
||||||
|
vuint8m1_t q5_x = __riscv_vle8_v_u8m1(q5, vl);
|
||||||
|
vint8m1_t q8_y1 = __riscv_vle8_v_i8m1(q8, vl);
|
||||||
|
vint8m1_t q8_y2 = __riscv_vle8_v_i8m1(q8+32, vl);
|
||||||
|
|
||||||
|
// compute mask for addition
|
||||||
|
vint8m1_t q5_a = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q5_x, 0x0F, vl));
|
||||||
|
vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
|
||||||
|
vbool8_t vmask_1 = __riscv_vmsne_vx_u8m1_b8(qh_m1, 0, vl);
|
||||||
|
vint8m1_t q5_m1 = __riscv_vadd_vx_i8m1_m(vmask_1, q5_a, 16, vl);
|
||||||
|
m <<= 1;
|
||||||
|
|
||||||
|
vint8m1_t q5_l = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q5_x, 0x04, vl));
|
||||||
|
vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
|
||||||
|
vbool8_t vmask_2 = __riscv_vmsne_vx_u8m1_b8(qh_m2, 0, vl);
|
||||||
|
vint8m1_t q5_m2 = __riscv_vadd_vx_i8m1_m(vmask_2, q5_l, 16, vl);
|
||||||
|
m <<= 1;
|
||||||
|
|
||||||
|
vint16m2_t v0 = __riscv_vwmul_vv_i16m2(q5_m1, q8_y1, vl);
|
||||||
|
vint16m2_t v1 = __riscv_vwmul_vv_i16m2(q5_m2, q8_y2, vl);
|
||||||
|
|
||||||
|
vint32m4_t vs1 = __riscv_vwmul_vx_i32m4(v0, scales[is++], vl);
|
||||||
|
vint32m4_t vs2 = __riscv_vwmul_vx_i32m4(v1, scales[is++], vl);
|
||||||
|
|
||||||
|
vint32m1_t vacc1 = __riscv_vredsum_vs_i32m4_i32m1(vs1, vzero, vl);
|
||||||
|
vint32m1_t vacc2 = __riscv_vredsum_vs_i32m4_i32m1(vs2, vzero, vl);
|
||||||
|
|
||||||
|
aux32 += __riscv_vmv_x_s_i32m1_i32(vacc1) + __riscv_vmv_x_s_i32m1_i32(vacc2);
|
||||||
|
q5 += 32; q8 += 64;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
vfloat32m1_t vaux = __riscv_vfmul_vf_f32m1(__riscv_vfmv_v_f_f32m1(aux32, 1), d, 1);
|
||||||
|
sums += __riscv_vfmv_f_s_f32m1_f32(vaux);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
*s = sumf+sums;
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
const uint8_t * scales = (const uint8_t*)&utmp[0];
|
const uint8_t * scales = (const uint8_t*)&utmp[0];
|
||||||
|
@ -3639,6 +4159,76 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
|
||||||
|
|
||||||
*s = hsum_float_8(acc);
|
*s = hsum_float_8(acc);
|
||||||
|
|
||||||
|
#elif defined __riscv_v_intrinsic
|
||||||
|
|
||||||
|
float sumf = 0;
|
||||||
|
|
||||||
|
for (int i = 0; i < nb; ++i) {
|
||||||
|
|
||||||
|
const float d = y[i].d * (float)x[i].d;
|
||||||
|
const int8_t * sc = x[i].scales;
|
||||||
|
|
||||||
|
const uint8_t * restrict q5 = x[i].qs;
|
||||||
|
const uint8_t * restrict qh = x[i].qh;
|
||||||
|
const int8_t * restrict q8 = y[i].qs;
|
||||||
|
|
||||||
|
vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
|
||||||
|
|
||||||
|
// load qh
|
||||||
|
vuint8mf4_t qh_x1 = __riscv_vle8_v_u8mf4(qh, 8);
|
||||||
|
vuint8mf2_t qh_x2 = __riscv_vlmul_ext_v_u8mf4_u8mf2(__riscv_vsrl_vx_u8mf4(qh_x1, 1, 8));
|
||||||
|
|
||||||
|
size_t vl = 16;
|
||||||
|
|
||||||
|
// combine both qh_1 and qh_2
|
||||||
|
vuint8mf2_t qh_x = __riscv_vslideup_vx_u8mf2(__riscv_vlmul_ext_v_u8mf4_u8mf2(qh_x1), qh_x2, vl/2, vl);
|
||||||
|
|
||||||
|
vuint8mf2_t qh_h0 = __riscv_vand_vx_u8mf2(__riscv_vnot_v_u8mf2(__riscv_vsll_vx_u8mf2(qh_x, 0x4, vl), vl), 16, vl);
|
||||||
|
vuint8mf2_t qh_h1 = __riscv_vand_vx_u8mf2(__riscv_vnot_v_u8mf2(__riscv_vsll_vx_u8mf2(qh_x, 0x2, vl), vl), 16, vl);
|
||||||
|
vuint8mf2_t qh_h2 = __riscv_vand_vx_u8mf2(__riscv_vnot_v_u8mf2(qh_x, vl), 16, vl);
|
||||||
|
vuint8mf2_t qh_h3 = __riscv_vand_vx_u8mf2(__riscv_vnot_v_u8mf2(__riscv_vsrl_vx_u8mf2(qh_x, 0x4, vl), vl), 16, vl);
|
||||||
|
|
||||||
|
vint8mf2_t qh_0 = __riscv_vreinterpret_v_u8mf2_i8mf2(qh_h0);
|
||||||
|
vint8mf2_t qh_1 = __riscv_vreinterpret_v_u8mf2_i8mf2(qh_h1);
|
||||||
|
vint8mf2_t qh_2 = __riscv_vreinterpret_v_u8mf2_i8mf2(qh_h2);
|
||||||
|
vint8mf2_t qh_3 = __riscv_vreinterpret_v_u8mf2_i8mf2(qh_h3);
|
||||||
|
|
||||||
|
// load q5
|
||||||
|
vuint8mf2_t q5_x1 = __riscv_vle8_v_u8mf2(q5, vl);
|
||||||
|
vuint8mf2_t q5_x2 = __riscv_vle8_v_u8mf2(q5+16, vl);
|
||||||
|
|
||||||
|
vint8mf2_t q5s_0 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(q5_x1, 0xF, vl));
|
||||||
|
vint8mf2_t q5s_1 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(q5_x2, 0xF, vl));
|
||||||
|
vint8mf2_t q5s_2 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vsrl_vx_u8mf2(q5_x1, 0x4, vl));
|
||||||
|
vint8mf2_t q5s_3 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vsrl_vx_u8mf2(q5_x2, 0x4, vl));
|
||||||
|
|
||||||
|
vint8mf2_t q5_0 = __riscv_vsub_vv_i8mf2(q5s_0, qh_0, vl);
|
||||||
|
vint8mf2_t q5_1 = __riscv_vsub_vv_i8mf2(q5s_1, qh_1, vl);
|
||||||
|
vint8mf2_t q5_2 = __riscv_vsub_vv_i8mf2(q5s_2, qh_2, vl);
|
||||||
|
vint8mf2_t q5_3 = __riscv_vsub_vv_i8mf2(q5s_3, qh_3, vl);
|
||||||
|
|
||||||
|
// load Q8 and multiply it with Q5
|
||||||
|
vint16m1_t p0 = __riscv_vwmul_vv_i16m1(q5_0, __riscv_vle8_v_i8mf2(q8, vl), vl);
|
||||||
|
vint16m1_t p1 = __riscv_vwmul_vv_i16m1(q5_1, __riscv_vle8_v_i8mf2(q8+16, vl), vl);
|
||||||
|
vint16m1_t p2 = __riscv_vwmul_vv_i16m1(q5_2, __riscv_vle8_v_i8mf2(q8+32, vl), vl);
|
||||||
|
vint16m1_t p3 = __riscv_vwmul_vv_i16m1(q5_3, __riscv_vle8_v_i8mf2(q8+48, vl), vl);
|
||||||
|
|
||||||
|
vint32m1_t vs_0 = __riscv_vwredsum_vs_i16m1_i32m1(p0, vzero, vl);
|
||||||
|
vint32m1_t vs_1 = __riscv_vwredsum_vs_i16m1_i32m1(p1, vzero, vl);
|
||||||
|
vint32m1_t vs_2 = __riscv_vwredsum_vs_i16m1_i32m1(p2, vzero, vl);
|
||||||
|
vint32m1_t vs_3 = __riscv_vwredsum_vs_i16m1_i32m1(p3, vzero, vl);
|
||||||
|
|
||||||
|
int32_t sumi1 = sc[0] * __riscv_vmv_x_s_i32m1_i32(vs_0);
|
||||||
|
int32_t sumi2 = sc[1] * __riscv_vmv_x_s_i32m1_i32(vs_1);
|
||||||
|
int32_t sumi3 = sc[2] * __riscv_vmv_x_s_i32m1_i32(vs_2);
|
||||||
|
int32_t sumi4 = sc[3] * __riscv_vmv_x_s_i32m1_i32(vs_3);
|
||||||
|
|
||||||
|
sumf += d * (sumi1 + sumi2 + sumi3 + sumi4);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
*s = sumf;
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
int8_t aux8[QK_K];
|
int8_t aux8[QK_K];
|
||||||
|
@@ -4023,6 +4613,91 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
|
||||||
|
|
||||||
*s = hsum_float_8(acc);
|
*s = hsum_float_8(acc);
|
||||||
|
|
||||||
|
#elif defined __riscv_v_intrinsic
|
||||||
|
|
||||||
|
float sumf = 0;
|
||||||
|
for (int i = 0; i < nb; ++i) {
|
||||||
|
|
||||||
|
const float d = ggml_fp16_to_fp32(x[i].d) * y[i].d;
|
||||||
|
|
||||||
|
const uint8_t * restrict q6 = x[i].ql;
|
||||||
|
const uint8_t * restrict qh = x[i].qh;
|
||||||
|
const int8_t * restrict q8 = y[i].qs;
|
||||||
|
|
||||||
|
const int8_t * restrict scale = x[i].scales;
|
||||||
|
|
||||||
|
size_t vl;
|
||||||
|
|
||||||
|
vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
|
||||||
|
|
||||||
|
int sum_t = 0;
|
||||||
|
int is = 0;
|
||||||
|
|
||||||
|
for (int j = 0; j < QK_K/128; ++j) {
|
||||||
|
|
||||||
|
vl = 32;
|
||||||
|
|
||||||
|
// load qh
|
||||||
|
vuint8m1_t qh_x = __riscv_vle8_v_u8m1(qh, vl);
|
||||||
|
|
||||||
|
// load Q6
|
||||||
|
vuint8m1_t q6_0 = __riscv_vle8_v_u8m1(q6, vl);
|
||||||
|
vuint8m1_t q6_1 = __riscv_vle8_v_u8m1(q6+32, vl);
|
||||||
|
|
||||||
|
vuint8m1_t q6a_0 = __riscv_vand_vx_u8m1(q6_0, 0x0F, vl);
|
||||||
|
vuint8m1_t q6a_1 = __riscv_vand_vx_u8m1(q6_1, 0x0F, vl);
|
||||||
|
vuint8m1_t q6s_0 = __riscv_vsrl_vx_u8m1(q6_0, 0x04, vl);
|
||||||
|
vuint8m1_t q6s_1 = __riscv_vsrl_vx_u8m1(q6_1, 0x04, vl);
|
||||||
|
|
||||||
|
vuint8m1_t qh_0 = __riscv_vand_vx_u8m1(qh_x, 0x03, vl);
|
||||||
|
vuint8m1_t qh_1 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x2, vl), 0x03 , vl);
|
||||||
|
vuint8m1_t qh_2 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x4, vl), 0x03 , vl);
|
||||||
|
vuint8m1_t qh_3 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x6, vl), 0x03 , vl);
|
||||||
|
|
||||||
|
vuint8m1_t qhi_0 = __riscv_vor_vv_u8m1(q6a_0, __riscv_vsll_vx_u8m1(qh_0, 0x04, vl), vl);
|
||||||
|
vuint8m1_t qhi_1 = __riscv_vor_vv_u8m1(q6a_1, __riscv_vsll_vx_u8m1(qh_1, 0x04, vl), vl);
|
||||||
|
vuint8m1_t qhi_2 = __riscv_vor_vv_u8m1(q6s_0, __riscv_vsll_vx_u8m1(qh_2, 0x04, vl), vl);
|
||||||
|
vuint8m1_t qhi_3 = __riscv_vor_vv_u8m1(q6s_1, __riscv_vsll_vx_u8m1(qh_3, 0x04, vl), vl);
|
||||||
|
|
||||||
|
vint8m1_t a_0 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_0), 32, vl);
|
||||||
|
vint8m1_t a_1 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_1), 32, vl);
|
||||||
|
vint8m1_t a_2 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_2), 32, vl);
|
||||||
|
vint8m1_t a_3 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_3), 32, vl);
|
||||||
|
|
||||||
|
// load Q8 and take product
|
||||||
|
vint16m2_t va_q_0 = __riscv_vwmul_vv_i16m2(a_0, __riscv_vle8_v_i8m1(q8, vl), vl);
|
||||||
|
vint16m2_t va_q_1 = __riscv_vwmul_vv_i16m2(a_1, __riscv_vle8_v_i8m1(q8+32, vl), vl);
|
||||||
|
vint16m2_t va_q_2 = __riscv_vwmul_vv_i16m2(a_2, __riscv_vle8_v_i8m1(q8+64, vl), vl);
|
||||||
|
vint16m2_t va_q_3 = __riscv_vwmul_vv_i16m2(a_3, __riscv_vle8_v_i8m1(q8+96, vl), vl);
|
||||||
|
|
||||||
|
vl = 16;
|
||||||
|
|
||||||
|
vint32m2_t vaux_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_0, 0), scale[is+0], vl);
|
||||||
|
vint32m2_t vaux_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_0, 1), scale[is+1], vl);
|
||||||
|
vint32m2_t vaux_2 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_1, 0), scale[is+2], vl);
|
||||||
|
vint32m2_t vaux_3 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_1, 1), scale[is+3], vl);
|
||||||
|
vint32m2_t vaux_4 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_2, 0), scale[is+4], vl);
|
||||||
|
vint32m2_t vaux_5 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_2, 1), scale[is+5], vl);
|
||||||
|
vint32m2_t vaux_6 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_3, 0), scale[is+6], vl);
|
||||||
|
vint32m2_t vaux_7 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_3, 1), scale[is+7], vl);
|
||||||
|
|
||||||
|
vint32m1_t isum0 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_0, vaux_1, vl), vzero, vl);
|
||||||
|
vint32m1_t isum1 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_2, vaux_3, vl), isum0, vl);
|
||||||
|
vint32m1_t isum2 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_4, vaux_5, vl), isum1, vl);
|
||||||
|
vint32m1_t isum3 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_6, vaux_7, vl), isum2, vl);
|
||||||
|
|
||||||
|
sum_t += __riscv_vmv_x_s_i32m1_i32(isum3);
|
||||||
|
|
||||||
|
q6 += 64; qh += 32; q8 += 128; is=8;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
sumf += d * sum_t;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
*s = sumf;
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
int8_t aux8[QK_K];
|
int8_t aux8[QK_K];
|
||||||
|
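For reference, here is a hedged scalar sketch of the per-super-block sum that the vectorized q6_K loop above computes (QK_K == 256 is assumed, and the pointer names follow the kernel; this is only an illustration, not the project's reference path in the #else branch):

#include <stdint.h>

static int32_t q6K_superblock_sumi(const uint8_t * q6,      // x[i].ql
                                   const uint8_t * qh,      // x[i].qh
                                   const int8_t  * q8,      // y[i].qs
                                   const int8_t  * scale) { // x[i].scales
    int32_t sum = 0;
    for (int j = 0; j < 256; j += 128) {
        for (int l = 0; l < 32; ++l) {
            const int is = l / 16; // each 16-element sub-block selects its own scale
            const int q0 = ((q6[l +  0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
            const int q1 = ((q6[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
            const int q2 = ((q6[l +  0] >>  4) | (((qh[l] >> 4) & 3) << 4)) - 32;
            const int q3 = ((q6[l + 32] >>  4) | (((qh[l] >> 6) & 3) << 4)) - 32;
            sum += scale[is + 0] * q0 * q8[l +  0];
            sum += scale[is + 2] * q1 * q8[l + 32];
            sum += scale[is + 4] * q2 * q8[l + 64];
            sum += scale[is + 6] * q3 * q8[l + 96];
        }
        q6 += 64; qh += 32; q8 += 128; scale += 8;
    }
    return sum; // the kernel then accumulates d * sum, with d = ggml_fp16_to_fp32(x[i].d) * y[i].d
}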
@@ -4276,6 +4951,73 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
|
||||||
|
|
||||||
*s = hsum_float_8(acc);
|
*s = hsum_float_8(acc);
|
||||||
|
|
||||||
|
#elif defined __riscv_v_intrinsic
|
||||||
|
|
||||||
|
float sumf = 0;
|
||||||
|
|
||||||
|
for (int i = 0; i < nb; ++i) {
|
||||||
|
|
||||||
|
const float d_all = (float)x[i].d;
|
||||||
|
|
||||||
|
const uint8_t * restrict q6 = x[i].ql;
|
||||||
|
const uint8_t * restrict qh = x[i].qh;
|
||||||
|
const int8_t * restrict q8 = y[i].qs;
|
||||||
|
|
||||||
|
const int8_t * restrict scale = x[i].scales;
|
||||||
|
|
||||||
|
int32_t isum = 0;
|
||||||
|
|
||||||
|
size_t vl = 16;
|
||||||
|
|
||||||
|
vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
|
||||||
|
|
||||||
|
// load Q6
|
||||||
|
vuint8mf2_t q6_0 = __riscv_vle8_v_u8mf2(q6, vl);
|
||||||
|
vuint8mf2_t q6_1 = __riscv_vle8_v_u8mf2(q6+16, vl);
|
||||||
|
|
||||||
|
// load qh
|
||||||
|
vuint8mf2_t qh_x = __riscv_vle8_v_u8mf2(qh, vl);
|
||||||
|
|
||||||
|
vuint8mf2_t qh0 = __riscv_vsll_vx_u8mf2(__riscv_vand_vx_u8mf2(qh_x, 0x3, vl), 0x4, vl);
|
||||||
|
qh_x = __riscv_vsrl_vx_u8mf2(qh_x, 0x2, vl);
|
||||||
|
vuint8mf2_t qh1 = __riscv_vsll_vx_u8mf2(__riscv_vand_vx_u8mf2(qh_x, 0x3, vl), 0x4, vl);
|
||||||
|
qh_x = __riscv_vsrl_vx_u8mf2(qh_x, 0x2, vl);
|
||||||
|
vuint8mf2_t qh2 = __riscv_vsll_vx_u8mf2(__riscv_vand_vx_u8mf2(qh_x, 0x3, vl), 0x4, vl);
|
||||||
|
qh_x = __riscv_vsrl_vx_u8mf2(qh_x, 0x2, vl);
|
||||||
|
vuint8mf2_t qh3 = __riscv_vsll_vx_u8mf2(__riscv_vand_vx_u8mf2(qh_x, 0x3, vl), 0x4, vl);
|
||||||
|
|
||||||
|
vuint8mf2_t q6h_0 = __riscv_vor_vv_u8mf2(__riscv_vand_vx_u8mf2(q6_0, 0xF, vl), qh0, vl);
|
||||||
|
vuint8mf2_t q6h_1 = __riscv_vor_vv_u8mf2(__riscv_vand_vx_u8mf2(q6_1, 0xF, vl), qh1, vl);
|
||||||
|
vuint8mf2_t q6h_2 = __riscv_vor_vv_u8mf2(__riscv_vsrl_vx_u8mf2(q6_0, 0x4, vl), qh2, vl);
|
||||||
|
vuint8mf2_t q6h_3 = __riscv_vor_vv_u8mf2(__riscv_vsrl_vx_u8mf2(q6_1, 0x4, vl), qh3, vl);
|
||||||
|
|
||||||
|
vint8mf2_t q6v_0 = __riscv_vsub_vx_i8mf2(__riscv_vreinterpret_v_u8mf2_i8mf2(q6h_0), 32, vl);
|
||||||
|
vint8mf2_t q6v_1 = __riscv_vsub_vx_i8mf2(__riscv_vreinterpret_v_u8mf2_i8mf2(q6h_1), 32, vl);
|
||||||
|
vint8mf2_t q6v_2 = __riscv_vsub_vx_i8mf2(__riscv_vreinterpret_v_u8mf2_i8mf2(q6h_2), 32, vl);
|
||||||
|
vint8mf2_t q6v_3 = __riscv_vsub_vx_i8mf2(__riscv_vreinterpret_v_u8mf2_i8mf2(q6h_3), 32, vl);
|
||||||
|
|
||||||
|
// load Q8 and take product
|
||||||
|
vint16m1_t p0 = __riscv_vwmul_vv_i16m1(q6v_0, __riscv_vle8_v_i8mf2(q8, vl), vl);
|
||||||
|
vint16m1_t p1 = __riscv_vwmul_vv_i16m1(q6v_1, __riscv_vle8_v_i8mf2(q8+16, vl), vl);
|
||||||
|
vint16m1_t p2 = __riscv_vwmul_vv_i16m1(q6v_2, __riscv_vle8_v_i8mf2(q8+32, vl), vl);
|
||||||
|
vint16m1_t p3 = __riscv_vwmul_vv_i16m1(q6v_3, __riscv_vle8_v_i8mf2(q8+48, vl), vl);
|
||||||
|
|
||||||
|
vint32m1_t vs_0 = __riscv_vwredsum_vs_i16m1_i32m1(p0, vzero, vl);
|
||||||
|
vint32m1_t vs_1 = __riscv_vwredsum_vs_i16m1_i32m1(p1, vzero, vl);
|
||||||
|
vint32m1_t vs_2 = __riscv_vwredsum_vs_i16m1_i32m1(p2, vzero, vl);
|
||||||
|
vint32m1_t vs_3 = __riscv_vwredsum_vs_i16m1_i32m1(p3, vzero, vl);
|
||||||
|
|
||||||
|
isum += __riscv_vmv_x_s_i32m1_i32(vs_0) * scale[0];
|
||||||
|
isum += __riscv_vmv_x_s_i32m1_i32(vs_1) * scale[1];
|
||||||
|
isum += __riscv_vmv_x_s_i32m1_i32(vs_2) * scale[2];
|
||||||
|
isum += __riscv_vmv_x_s_i32m1_i32(vs_3) * scale[3];
|
||||||
|
|
||||||
|
sumf += isum * d_all * y[i].d;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
*s = sumf;
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
int8_t aux8[QK_K];
|
int8_t aux8[QK_K];
|
||||||
|
|
10
k_quants.h
|
@@ -29,7 +29,7 @@
|
||||||
|
|
||||||
// 2-bit quantization
|
// 2-bit quantization
|
||||||
// weight is represented as x = a * q + b
|
// weight is represented as x = a * q + b
|
||||||
// 16 blocks of 16 elemenets each
|
// 16 blocks of 16 elements each
|
||||||
// Effectively 2.5625 bits per weight
|
// Effectively 2.5625 bits per weight
|
||||||
typedef struct {
|
typedef struct {
|
||||||
uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
|
uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
|
||||||
|
@@ -41,7 +41,7 @@ static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "w
|
||||||
|
|
||||||
// 3-bit quantization
|
// 3-bit quantization
|
||||||
// weight is represented as x = a * q
|
// weight is represented as x = a * q
|
||||||
// 16 blocks of 16 elemenets each
|
// 16 blocks of 16 elements each
|
||||||
// Effectively 3.4375 bits per weight
|
// Effectively 3.4375 bits per weight
|
||||||
#ifdef GGML_QKK_64
|
#ifdef GGML_QKK_64
|
||||||
typedef struct {
|
typedef struct {
|
||||||
|
@@ -62,7 +62,7 @@ static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 +
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// 4-bit quantization
|
// 4-bit quantization
|
||||||
// 16 blocks of 32 elements each
|
// 8 blocks of 32 elements each
|
||||||
// weight is represented as x = a * q + b
|
// weight is represented as x = a * q + b
|
||||||
// Effectively 4.5 bits per weight
|
// Effectively 4.5 bits per weight
|
||||||
#ifdef GGML_QKK_64
|
#ifdef GGML_QKK_64
|
||||||
|
@@ -83,7 +83,7 @@ static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// 5-bit quantization
|
// 5-bit quantization
|
||||||
// 16 blocks of 32 elements each
|
// 8 blocks of 32 elements each
|
||||||
// weight is represented as x = a * q + b
|
// weight is represented as x = a * q + b
|
||||||
// Effectively 5.5 bits per weight
|
// Effectively 5.5 bits per weight
|
||||||
#ifdef GGML_QKK_64
|
#ifdef GGML_QKK_64
|
||||||
|
@@ -107,7 +107,7 @@ static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/
|
||||||
|
|
||||||
// 6-bit quantization
|
// 6-bit quantization
|
||||||
// weight is represented as x = a * q
|
// weight is represented as x = a * q
|
||||||
// 16 blocks of 16 elemenets each
|
// 16 blocks of 16 elements each
|
||||||
// Effectively 6.5625 bits per weight
|
// Effectively 6.5625 bits per weight
|
||||||
typedef struct {
|
typedef struct {
|
||||||
uint8_t ql[QK_K/2]; // quants, lower 4 bits
|
uint8_t ql[QK_K/2]; // quants, lower 4 bits
|
||||||
|
|
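The "Effectively N bits per weight" figures in these comments follow directly from the block layouts. As a standalone back-of-the-envelope check for the 6-bit case (QK_K = 256 is assumed, and the q6_K members not visible in the hunk above - qh[QK_K/4], scales[QK_K/16] and one ggml_fp16_t d - are assumed to match the upstream struct):

#include <cstdio>

int main() {
    const int QK_K   = 256;       // super-block size
    const int ql     = QK_K / 2;  // lower 4 bits of each quant
    const int qh     = QK_K / 4;  // upper 2 bits of each quant
    const int scales = QK_K / 16; // one int8 scale per 16 weights
    const int d      = 2;         // one fp16 super-block scale
    const int bytes  = ql + qh + scales + d;                        // 210 bytes per super-block
    printf("q6_K: %.4f bits per weight\n", 8.0 * bytes / QK_K);     // prints 6.5625
    return 0;
}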
13
llama.h
|
@@ -42,7 +42,7 @@
|
||||||
#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
|
#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
|
||||||
|
|
||||||
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
|
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
|
||||||
#define LLAMA_SESSION_VERSION 1
|
#define LLAMA_SESSION_VERSION 2
|
||||||
|
|
||||||
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
|
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
|
||||||
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
|
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
|
||||||
|
@@ -282,6 +282,9 @@ extern "C" {
|
||||||
LLAMA_API int llama_n_ctx_train(const struct llama_model * model);
|
LLAMA_API int llama_n_ctx_train(const struct llama_model * model);
|
||||||
LLAMA_API int llama_n_embd (const struct llama_model * model);
|
LLAMA_API int llama_n_embd (const struct llama_model * model);
|
||||||
|
|
||||||
|
// Get the model's RoPE frequency scaling factor
|
||||||
|
LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);
|
||||||
|
|
||||||
// Get a string describing the model type
|
// Get a string describing the model type
|
||||||
LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
|
LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
|
||||||
|
|
||||||
|
@@ -330,12 +333,16 @@ extern "C" {
|
||||||
"avoid using this, it will be removed in the future, instead - count the tokens in user code");
|
"avoid using this, it will be removed in the future, instead - count the tokens in user code");
|
||||||
|
|
||||||
// Remove all tokens data of cells in [c0, c1)
|
// Remove all tokens data of cells in [c0, c1)
|
||||||
|
// c0 < 0 : [0, c1]
|
||||||
|
// c1 < 0 : [c0, inf)
|
||||||
LLAMA_API void llama_kv_cache_tokens_rm(
|
LLAMA_API void llama_kv_cache_tokens_rm(
|
||||||
struct llama_context * ctx,
|
struct llama_context * ctx,
|
||||||
int32_t c0,
|
int32_t c0,
|
||||||
int32_t c1);
|
int32_t c1);
|
||||||
|
|
||||||
// Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
|
// Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
|
||||||
|
// p0 < 0 : [0, p1]
|
||||||
|
// p1 < 0 : [p0, inf)
|
||||||
LLAMA_API void llama_kv_cache_seq_rm(
|
LLAMA_API void llama_kv_cache_seq_rm(
|
||||||
struct llama_context * ctx,
|
struct llama_context * ctx,
|
||||||
llama_seq_id seq_id,
|
llama_seq_id seq_id,
|
||||||
|
@@ -344,6 +351,8 @@ extern "C" {
|
||||||
|
|
||||||
// Copy all tokens that belong to the specified sequence to another sequence
|
// Copy all tokens that belong to the specified sequence to another sequence
|
||||||
// Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
|
// Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
|
||||||
|
// p0 < 0 : [0, p1]
|
||||||
|
// p1 < 0 : [p0, inf)
|
||||||
LLAMA_API void llama_kv_cache_seq_cp(
|
LLAMA_API void llama_kv_cache_seq_cp(
|
||||||
struct llama_context * ctx,
|
struct llama_context * ctx,
|
||||||
llama_seq_id seq_id_src,
|
llama_seq_id seq_id_src,
|
||||||
|
@@ -358,6 +367,8 @@ extern "C" {
|
||||||
|
|
||||||
// Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
|
// Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
|
||||||
// If the KV cache is RoPEd, the KV data is updated accordingly
|
// If the KV cache is RoPEd, the KV data is updated accordingly
|
||||||
|
// p0 < 0 : [0, p1]
|
||||||
|
// p1 < 0 : [p0, inf)
|
||||||
LLAMA_API void llama_kv_cache_seq_shift(
|
LLAMA_API void llama_kv_cache_seq_shift(
|
||||||
struct llama_context * ctx,
|
struct llama_context * ctx,
|
||||||
llama_seq_id seq_id,
|
llama_seq_id seq_id,
|
||||||
|
|
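A hedged usage sketch of the new KV-cache calls declared above. The parameter lists past seq_id are not fully visible in these hunks and are assumed from the llama.cpp API of this period (p0/p1 are llama_pos bounds, with p1 < 0 meaning [p0, inf) as the comments state):

#include "llama.h"

// Drop tokens [n_keep, n_keep + n_drop) of one sequence, then slide the rest
// back so positions stay contiguous (RoPEd K data is updated accordingly,
// per the comment on llama_kv_cache_seq_shift).
static void drop_and_shift(struct llama_context * ctx, llama_seq_id seq,
                           llama_pos n_keep, llama_pos n_drop) {
    llama_kv_cache_seq_rm   (ctx, seq, n_keep, n_keep + n_drop);      // assumed: (ctx, seq, p0, p1)
    llama_kv_cache_seq_shift(ctx, seq, n_keep + n_drop, -1, -n_drop); // assumed: (ctx, seq, p0, p1, delta)
}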
BIN
models/ggml-vocab-aquila.gguf
Normal file
Binary file not shown.
BIN
models/ggml-vocab-falcon.gguf
Normal file
Binary file not shown.
49
prompts/LLM-questions.txt
Normal file
|
@@ -0,0 +1,49 @@
|
||||||
|
In the context of LLMs, what is "Attention"?
|
||||||
|
In the context of LLMs, what is a completion?
|
||||||
|
In the context of LLMs, what is a prompt?
|
||||||
|
In the context of LLMs, what is GELU?
|
||||||
|
In the context of LLMs, what is RELU?
|
||||||
|
In the context of LLMs, what is softmax?
|
||||||
|
In the context of LLMs, what is decoding?
|
||||||
|
In the context of LLMs, what is encoding?
|
||||||
|
In the context of LLMs, what is tokenizing?
|
||||||
|
In the context of LLMs, what is an embedding?
|
||||||
|
In the context of LLMs, what is quantization?
|
||||||
|
In the context of LLMs, what is a tensor?
|
||||||
|
In the context of LLMs, what is a sparse tensor?
|
||||||
|
In the context of LLMs, what is a vector?
|
||||||
|
In the context of LLMs, how is attention implemented?
|
||||||
|
In the context of LLMs, why is attention all you need?
|
||||||
|
In the context of LLMs, what is "RoPe" and what is it used for?
|
||||||
|
In the context of LLMs, what is "LoRA" and what is it used for?
|
||||||
|
In the context of LLMs, what are weights?
|
||||||
|
In the context of LLMs, what are biases?
|
||||||
|
In the context of LLMs, what are checkpoints?
|
||||||
|
In the context of LLMs, what is "perplexity"?
|
||||||
|
In the context of LLMs, what are models?
|
||||||
|
In the context of machine-learning, what is "catastrophic forgetting"?
|
||||||
|
In the context of machine-learning, what is "elastic weight consolidation (EWC)"?
|
||||||
|
In the context of neural nets, what is a hidden layer?
|
||||||
|
In the context of neural nets, what is a convolution?
|
||||||
|
In the context of neural nets, what is dropout?
|
||||||
|
In the context of neural nets, what is cross-entropy?
|
||||||
|
In the context of neural nets, what is over-fitting?
|
||||||
|
In the context of neural nets, what is under-fitting?
|
||||||
|
What is the difference between an interpreted computer language and a compiled computer language?
|
||||||
|
In the context of software development, what is a debugger?
|
||||||
|
When processing using a GPU, what is off-loading?
|
||||||
|
When processing using a GPU, what is a batch?
|
||||||
|
When processing using a GPU, what is a block?
|
||||||
|
When processing using a GPU, what is the difference between a batch and a block?
|
||||||
|
When processing using a GPU, what is a scratch tensor?
|
||||||
|
When processing using a GPU, what is a layer?
|
||||||
|
When processing using a GPU, what is a cache?
|
||||||
|
When processing using a GPU, what is unified memory?
|
||||||
|
When processing using a GPU, what is VRAM?
|
||||||
|
When processing using a GPU, what is a kernel?
|
||||||
|
When processing using a GPU, what is "metal"?
|
||||||
|
In the context of LLMs, what are "Zero-Shot", "One-Shot" and "Few-Shot" learning models?
|
||||||
|
In the context of LLMs, what is the "Transformer-model" architecture?
|
||||||
|
In the context of LLMs, what is "Multi-Head Attention"?
|
||||||
|
In the context of LLMs, what is "Self-Attention"?
|
||||||
|
In the context of transformer-model architectures, how do attention mechanisms use masks?
|
43
prompts/parallel-questions.txt
Normal file
|
@@ -0,0 +1,43 @@
|
||||||
|
What do you know about Hobbits?
|
||||||
|
What is quantum field theory?
|
||||||
|
Why did the chicken cross the road?
|
||||||
|
Who is the president of the United States?
|
||||||
|
How do I run CMake on MacOS?
|
||||||
|
Do you agree that C++ is a really finicky language compared with Python3?
|
||||||
|
Is it a good idea to invest in technology?
|
||||||
|
Do you like Wagner's Ring?
|
||||||
|
Do you think this file input option is really neat?
|
||||||
|
What should we all do about climate change?
|
||||||
|
Is time-travel possible within the laws of current physics?
|
||||||
|
Is it like anything to be a bat?
|
||||||
|
Once the chicken has crossed the road, does it try to go back?
|
||||||
|
Who is the greatest of all musical composers?
|
||||||
|
What is art?
|
||||||
|
Is there life elsewhere in the universe?
|
||||||
|
What is intelligence?
|
||||||
|
What is the difference between knowledge and intelligence?
|
||||||
|
Will religion ever die?
|
||||||
|
Do we understand ourselves?
|
||||||
|
What is the best way to cook eggs?
|
||||||
|
If you cannot see things, on what basis do you evaluate them?
|
||||||
|
Explain the role of the np junction in photovoltaic cells?
|
||||||
|
Is professional sport a good or bad influence on human behaviour?
|
||||||
|
Is capital punishment immoral?
|
||||||
|
Should we care about other people?
|
||||||
|
Who are you?
|
||||||
|
Which sense would you surrender if you could?
|
||||||
|
Was Henry Ford a hero or a villain?
|
||||||
|
Do we need leaders?
|
||||||
|
What is nucleosynthesis?
|
||||||
|
Who is the greatest scientist of all time?
|
||||||
|
Who first observed what came to be known as the photovoltaic effect?
|
||||||
|
What is nuclear fusion and why does it release energy?
|
||||||
|
Can you know that you exist?
|
||||||
|
What is an exoplanet?
|
||||||
|
Do you like cream?
|
||||||
|
What is the difference?
|
||||||
|
Can I know that I exist while I'm dreaming that I'm Descartes?
|
||||||
|
Who said "I didn't know I thought that until I heard myself saying it"?
|
||||||
|
Does anything really matter?
|
||||||
|
Can you explain the unreasonable effectiveness of mathematics?
|
||||||
|
|
|
@@ -1,3 +1,3 @@
|
||||||
numpy==1.24
|
numpy==1.24.4
|
||||||
sentencepiece==0.1.98
|
sentencepiece==0.1.98
|
||||||
gguf>=0.1.0
|
gguf>=0.1.0
|
||||||
|
|
|
@@ -2,6 +2,7 @@
|
||||||
|
|
||||||
cp -rpv ../ggml/src/ggml.c ./ggml.c
|
cp -rpv ../ggml/src/ggml.c ./ggml.c
|
||||||
cp -rpv ../ggml/src/ggml-alloc.c ./ggml-alloc.c
|
cp -rpv ../ggml/src/ggml-alloc.c ./ggml-alloc.c
|
||||||
|
cp -rpv ../ggml/src/ggml-backend.c ./ggml-backend.c
|
||||||
cp -rpv ../ggml/src/ggml-cuda.h ./ggml-cuda.h
|
cp -rpv ../ggml/src/ggml-cuda.h ./ggml-cuda.h
|
||||||
cp -rpv ../ggml/src/ggml-cuda.cu ./ggml-cuda.cu
|
cp -rpv ../ggml/src/ggml-cuda.cu ./ggml-cuda.cu
|
||||||
cp -rpv ../ggml/src/ggml-opencl.h ./ggml-opencl.h
|
cp -rpv ../ggml/src/ggml-opencl.h ./ggml-opencl.h
|
||||||
|
@@ -11,6 +12,7 @@ cp -rpv ../ggml/src/ggml-metal.m ./ggml-metal.m
|
||||||
cp -rpv ../ggml/src/ggml-metal.metal ./ggml-metal.metal
|
cp -rpv ../ggml/src/ggml-metal.metal ./ggml-metal.metal
|
||||||
cp -rpv ../ggml/include/ggml/ggml.h ./ggml.h
|
cp -rpv ../ggml/include/ggml/ggml.h ./ggml.h
|
||||||
cp -rpv ../ggml/include/ggml/ggml-alloc.h ./ggml-alloc.h
|
cp -rpv ../ggml/include/ggml/ggml-alloc.h ./ggml-alloc.h
|
||||||
|
cp -rpv ../ggml/include/ggml/ggml-backend.h ./ggml-backend.h
|
||||||
|
|
||||||
cp -rpv ../ggml/tests/test-opt.cpp ./tests/test-opt.cpp
|
cp -rpv ../ggml/tests/test-opt.cpp ./tests/test-opt.cpp
|
||||||
cp -rpv ../ggml/tests/test-grad0.cpp ./tests/test-grad0.cpp
|
cp -rpv ../ggml/tests/test-grad0.cpp ./tests/test-grad0.cpp
|
||||||
|
|
|
@@ -7,9 +7,6 @@ endfunction()
|
||||||
|
|
||||||
function(llama_test_executable name source)
|
function(llama_test_executable name source)
|
||||||
get_filename_component(TEST_TARGET ${source} NAME_WE)
|
get_filename_component(TEST_TARGET ${source} NAME_WE)
|
||||||
# add_executable(${TEST_TARGET} ${source})
|
|
||||||
# install(TARGETS ${TEST_TARGET} RUNTIME)
|
|
||||||
# target_link_libraries(${TEST_TARGET} PRIVATE llama)
|
|
||||||
add_test(NAME ${name} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
|
add_test(NAME ${name} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
|
||||||
endfunction()
|
endfunction()
|
||||||
|
|
||||||
|
@@ -28,10 +25,12 @@ llama_build_and_test_executable(test-sampling.cpp)
|
||||||
llama_build_executable(test-tokenizer-0-llama.cpp)
|
llama_build_executable(test-tokenizer-0-llama.cpp)
|
||||||
llama_test_executable (test-tokenizer-0-llama test-tokenizer-0-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
|
llama_test_executable (test-tokenizer-0-llama test-tokenizer-0-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
|
||||||
llama_build_executable(test-tokenizer-0-falcon.cpp)
|
llama_build_executable(test-tokenizer-0-falcon.cpp)
|
||||||
#llama_test_executable (test-tokenizer-0-falcon test-tokenizer-0-falcon.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
|
llama_test_executable (test-tokenizer-0-falcon test-tokenizer-0-falcon.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
|
||||||
llama_build_executable(test-tokenizer-1-llama.cpp)
|
llama_build_executable(test-tokenizer-1-llama.cpp)
|
||||||
llama_test_executable (test-tokenizer-1-llama test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
|
llama_test_executable (test-tokenizer-1-llama test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
|
||||||
#llama_test_executable(test-tokenizer-1.aquila test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
|
llama_build_executable(test-tokenizer-1-bpe.cpp)
|
||||||
|
llama_test_executable (test-tokenizer-1-falcon test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
|
||||||
|
llama_test_executable(test-tokenizer-1-aquila test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
|
||||||
llama_build_and_test_executable(test-grammar-parser.cpp)
|
llama_build_and_test_executable(test-grammar-parser.cpp)
|
||||||
llama_build_and_test_executable(test-llama-grammar.cpp)
|
llama_build_and_test_executable(test-llama-grammar.cpp)
|
||||||
llama_build_and_test_executable(test-grad0.cpp) # SLOW
|
llama_build_and_test_executable(test-grad0.cpp) # SLOW
|
||||||
|
|
|
@@ -208,26 +208,6 @@ static struct ggml_tensor * get_random_tensor_i32(
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void print_elements(const char* label, const struct ggml_tensor * t) {
|
|
||||||
if (!t) {
|
|
||||||
printf("%s: %s = null\n", __func__, label);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
const int nelements = ggml_nelements(t);
|
|
||||||
printf("%s: %s = [", __func__, label);
|
|
||||||
for (int k = 0; k < nelements; ++k) {
|
|
||||||
if (k > 0) { printf(", "); }
|
|
||||||
printf("%.5f", ggml_get_f32_1d(t, k));
|
|
||||||
}
|
|
||||||
printf("] shape: [");
|
|
||||||
for (int k = 0; k < t->n_dims; ++k) {
|
|
||||||
if (k > 0) { printf(", "); }
|
|
||||||
printf("%d", (int)t->ne[k]);
|
|
||||||
}
|
|
||||||
printf("]\n");
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool check_gradient(
|
static bool check_gradient(
|
||||||
const char * op_name,
|
const char * op_name,
|
||||||
struct ggml_context * ctx0,
|
struct ggml_context * ctx0,
|
||||||
|
|
|
@@ -40,27 +40,6 @@ static float frand(void) {
|
||||||
return (float)rand()/(float)RAND_MAX;
|
return (float)rand()/(float)RAND_MAX;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int irand(int n) {
|
|
||||||
return rand()%n;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void get_random_dims(int64_t * dims, int ndims) {
|
|
||||||
dims[0] = dims[1] = dims[2] = dims[3] = 1;
|
|
||||||
|
|
||||||
for (int i = 0; i < ndims; i++) {
|
|
||||||
dims[i] = 1 + irand(4);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static void get_random_dims_minmax(int64_t * dims, int ndims, int min, int max) {
|
|
||||||
dims[0] = dims[1] = dims[2] = dims[3] = 1;
|
|
||||||
|
|
||||||
for (int i = 0; i < ndims; i++) {
|
|
||||||
dims[i] = min + irand(max-min);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
static struct ggml_tensor * get_random_tensor(
|
static struct ggml_tensor * get_random_tensor(
|
||||||
struct ggml_context * ctx0, int ndims, int64_t ne[], float fmin, float fmax
|
struct ggml_context * ctx0, int ndims, int64_t ne[], float fmin, float fmax
|
||||||
) {
|
) {
|
||||||
|
@@ -106,14 +85,6 @@ static struct ggml_tensor * get_random_tensor(
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
static float get_element(const struct ggml_tensor * t, int idx) {
|
|
||||||
return ((float *)t->data)[idx];
|
|
||||||
}
|
|
||||||
|
|
||||||
static void set_element(struct ggml_tensor * t, int idx, float value) {
|
|
||||||
((float *)t->data)[idx] = value;
|
|
||||||
}
|
|
||||||
|
|
||||||
int main(void) {
|
int main(void) {
|
||||||
struct ggml_init_params params = {
|
struct ggml_init_params params = {
|
||||||
/* .mem_size = */ 1024*1024*1024,
|
/* .mem_size = */ 1024*1024*1024,
|
||||||
|
|
|
@@ -76,22 +76,21 @@ static void * align_with_offset(void * ptr, int offset) {
|
||||||
return (char *) std::align(MAX_ALIGNMENT, MAX_ALIGNMENT, ptr, dummy_size) + offset;
|
return (char *) std::align(MAX_ALIGNMENT, MAX_ALIGNMENT, ptr, dummy_size) + offset;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void benchmark_function(size_t size, size_t q_size, int64_t iterations, const std::function<size_t(void)> & function) {
|
static void benchmark_function(size_t size, size_t q_size, int64_t iterations, const std::function<float(void)> & func) {
|
||||||
int64_t min_time_us = INT64_MAX;
|
int64_t min_time_us = INT64_MAX;
|
||||||
int64_t total_time_us = 0;
|
int64_t total_time_us = 0;
|
||||||
int64_t min_time_cycles = INT64_MAX;
|
int64_t min_time_cycles = INT64_MAX;
|
||||||
int64_t total_time_cycles = 0;
|
int64_t total_time_cycles = 0;
|
||||||
|
|
||||||
for (int i = 0; i < WARMUP; i++) {
|
for (int i = 0; i < WARMUP; i++) {
|
||||||
function();
|
func();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
for (int i = 0; i < iterations; i++) {
|
for (int i = 0; i < iterations; i++) {
|
||||||
const int64_t start_time = ggml_time_us();
|
const int64_t start_time = ggml_time_us();
|
||||||
const int64_t start_cycles = cpu_cycles();
|
const int64_t start_cycles = cpu_cycles();
|
||||||
|
|
||||||
function();
|
func();
|
||||||
|
|
||||||
const int64_t end_cycles = cpu_cycles();
|
const int64_t end_cycles = cpu_cycles();
|
||||||
const int64_t end_time = ggml_time_us();
|
const int64_t end_time = ggml_time_us();
|
||||||
|
@@ -283,7 +282,7 @@ int main(int argc, char * argv[]) {
|
||||||
printf(" quantize_row_q_reference\n");
|
printf(" quantize_row_q_reference\n");
|
||||||
for (size_t size : params.test_sizes) {
|
for (size_t size : params.test_sizes) {
|
||||||
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
||||||
auto quantize_fn = [&](void ) {
|
auto quantize_fn = [&](void) -> float {
|
||||||
qfns.from_float_reference(test_data1, test_q1, size);
|
qfns.from_float_reference(test_data1, test_q1, size);
|
||||||
return test_q1[0];
|
return test_q1[0];
|
||||||
};
|
};
|
||||||
|
@@ -297,7 +296,7 @@ int main(int argc, char * argv[]) {
|
||||||
printf(" quantize_row_q\n");
|
printf(" quantize_row_q\n");
|
||||||
for (size_t size : params.test_sizes) {
|
for (size_t size : params.test_sizes) {
|
||||||
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
||||||
auto quantize_fn = [&](void ) {
|
auto quantize_fn = [&](void) -> float {
|
||||||
qfns.from_float(test_data1, test_q1, size);
|
qfns.from_float(test_data1, test_q1, size);
|
||||||
return test_q1[0];
|
return test_q1[0];
|
||||||
};
|
};
|
||||||
|
@@ -312,7 +311,7 @@ int main(int argc, char * argv[]) {
|
||||||
qfns.from_float(test_data1, test_q1, largest);
|
qfns.from_float(test_data1, test_q1, largest);
|
||||||
for (size_t size : params.test_sizes) {
|
for (size_t size : params.test_sizes) {
|
||||||
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
||||||
auto quantize_fn = [&](void ) {
|
auto quantize_fn = [&](void) -> float {
|
||||||
qfns.to_float(test_q1, test_out, size);
|
qfns.to_float(test_q1, test_out, size);
|
||||||
return test_out[0];
|
return test_out[0];
|
||||||
};
|
};
|
||||||
|
@@ -326,7 +325,7 @@ int main(int argc, char * argv[]) {
|
||||||
printf(" quantize_row_q_dot\n");
|
printf(" quantize_row_q_dot\n");
|
||||||
for (size_t size : params.test_sizes) {
|
for (size_t size : params.test_sizes) {
|
||||||
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
||||||
auto quantize_fn = [&](void ) {
|
auto quantize_fn = [&](void) -> float {
|
||||||
auto vdot = ggml_internal_get_type_traits(qfns.vec_dot_type);
|
auto vdot = ggml_internal_get_type_traits(qfns.vec_dot_type);
|
||||||
vdot.from_float(test_data1, test_q1, size);
|
vdot.from_float(test_data1, test_q1, size);
|
||||||
return test_q1[0];
|
return test_q1[0];
|
||||||
|
@@ -343,7 +342,7 @@ int main(int argc, char * argv[]) {
|
||||||
qfns.from_float(test_data2, test_q2, largest);
|
qfns.from_float(test_data2, test_q2, largest);
|
||||||
for (size_t size : params.test_sizes) {
|
for (size_t size : params.test_sizes) {
|
||||||
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
||||||
auto quantize_fn = [&](void ) {
|
auto quantize_fn = [&](void) -> float {
|
||||||
float result;
|
float result;
|
||||||
qfns.vec_dot(size, &result, test_q1, test_q2);
|
qfns.vec_dot(size, &result, test_q1, test_q2);
|
||||||
return result;
|
return result;
|
||||||
|
|
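For illustration, a minimal harness in the same shape as benchmark_function above: the timed functor now returns a float and the caller consumes it, which helps keep the measured work observable to the compiler (the code below is a sketch, not the test's own harness):

#include <cstdint>
#include <cstdio>
#include <functional>

static float time_and_consume(int64_t iterations, const std::function<float(void)> & func) {
    float sink = 0.0f;
    for (int64_t i = 0; i < iterations; i++) {
        sink += func(); // consume every result so the work is not optimized away
    }
    return sink;
}

int main() {
    float x = 1.0f;
    const float r = time_and_consume(1000, [&](void) -> float { x *= 1.000001f; return x; });
    printf("%f\n", r);
    return 0;
}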
|
@@ -1,5 +1,6 @@
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
#include "console.h"
|
||||||
|
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
@@ -85,12 +86,18 @@ int main(int argc, char **argv) {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (llama_vocab_type(model) != LLAMA_VOCAB_TYPE_BPE) {
|
if (llama_vocab_type(model) != LLAMA_VOCAB_TYPE_BPE) {
|
||||||
fprintf(stderr, "%s : error: vocab type is not SPM\n", __func__);
|
fprintf(stderr, "%s : error: vocab type is not BPE\n", __func__);
|
||||||
llama_free_model(model);
|
llama_free_model(model);
|
||||||
llama_free(ctx);
|
llama_free(ctx);
|
||||||
return 2;
|
return 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef _WIN32
|
||||||
|
// We need this for unicode console support
|
||||||
|
console::init(false, false);
|
||||||
|
atexit([]() { console::cleanup(); });
|
||||||
|
#endif
|
||||||
|
|
||||||
bool success = true;
|
bool success = true;
|
||||||
|
|
||||||
for (const auto & test_kv : k_tests()) {
|
for (const auto & test_kv : k_tests()) {
|
||||||
|
|
113
tests/test-tokenizer-1-bpe.cpp
Normal file
|
@@ -0,0 +1,113 @@
|
||||||
|
#include "llama.h"
|
||||||
|
#include "common.h"
|
||||||
|
#include "unicode.h"
|
||||||
|
#include "console.h"
|
||||||
|
|
||||||
|
#include <cassert>
|
||||||
|
#include <cstdio>
|
||||||
|
#include <cstring>
|
||||||
|
#include <string>
|
||||||
|
#include <codecvt>
|
||||||
|
#include <map>
|
||||||
|
#include <vector>
|
||||||
|
#include <locale>
|
||||||
|
|
||||||
|
int main(int argc, char **argv) {
|
||||||
|
if (argc < 2) {
|
||||||
|
fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
const std::string fname = argv[1];
|
||||||
|
|
||||||
|
fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
|
||||||
|
|
||||||
|
llama_model * model;
|
||||||
|
llama_context * ctx;
|
||||||
|
|
||||||
|
llama_backend_init(false);
|
||||||
|
|
||||||
|
// load the vocab
|
||||||
|
{
|
||||||
|
auto mparams = llama_model_default_params();
|
||||||
|
|
||||||
|
mparams.vocab_only = true;
|
||||||
|
|
||||||
|
model = llama_load_model_from_file(fname.c_str(), mparams);
|
||||||
|
|
||||||
|
if (model == NULL) {
|
||||||
|
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto cparams = llama_context_default_params();
|
||||||
|
|
||||||
|
ctx = llama_new_context_with_model(model, cparams);
|
||||||
|
|
||||||
|
if (ctx == NULL) {
|
||||||
|
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
|
||||||
|
llama_free_model(model);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
GGML_ASSERT(llama_vocab_type(model) == LLAMA_VOCAB_TYPE_BPE);
|
||||||
|
|
||||||
|
#ifdef _WIN32
|
||||||
|
// We need this for unicode console support
|
||||||
|
console::init(false, false);
|
||||||
|
atexit([]() { console::cleanup(); });
|
||||||
|
#endif
|
||||||
|
|
||||||
|
const int n_vocab = llama_n_vocab(model);
|
||||||
|
|
||||||
|
for (int i = 0; i < n_vocab; ++i) {
|
||||||
|
std::string str = llama_detokenize_bpe(ctx, std::vector<int>(1, i));
|
||||||
|
try {
|
||||||
|
auto cps = codepoints_from_utf8(str);
|
||||||
|
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
|
||||||
|
std::string check = llama_detokenize_bpe(ctx, tokens);
|
||||||
|
if (check != str) {
|
||||||
|
fprintf(stderr, "%s : error: token %d detokenizes to '%s'(%zu) but tokenization of this detokenizes to '%s'(%zu)\n",
|
||||||
|
__func__, i, str.c_str(), str.length(), check.c_str(), check.length());
|
||||||
|
return 2;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
catch (const std::invalid_argument &) {
|
||||||
|
fprintf(stderr, "%s : info: utf8 conversion %d '%s'\n", __func__, i, str.c_str());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (uint32_t cp = 0x0000; cp < 0xffff; ++cp) {
|
||||||
|
// NOTE: these exceptions seem to be necessary, because the GPT2 tokenizer doesn't want to interfere with some ASCII control characters
|
||||||
|
if ((cp < 0x03 || cp > 0x05) && cp != 0x0b && cp != 0x11 && (cp < 0x13 || cp > 0x17) && cp != 0x19 && (cp < 0x1c || cp > 0x1e) && (cp < 0xd800 || cp > 0xdfff)) {
|
||||||
|
std::string str = " " + codepoint_to_utf8(cp);
|
||||||
|
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
|
||||||
|
std::string check = llama_detokenize_bpe(ctx, tokens);
|
||||||
|
if (str != check) {
|
||||||
|
fprintf(stderr, "%s : error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
|
||||||
|
__func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
|
||||||
|
return 3;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// TODO: why doesn't this work for the full range of Unicodes?
|
||||||
|
// for (uint32_t cp = 0x10000; cp < 0x0010ffff; ++cp) {
|
||||||
|
for (uint32_t cp = 0x10000; cp < 0x00080000; ++cp) {
|
||||||
|
std::string str = codepoint_to_utf8(cp);
|
||||||
|
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
|
||||||
|
std::string check = llama_detokenize_bpe(ctx, tokens);
|
||||||
|
if (str != check) {
|
||||||
|
fprintf(stderr, "%s : error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
|
||||||
|
__func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
|
||||||
|
return 4;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_free_model(model);
|
||||||
|
llama_free(ctx);
|
||||||
|
|
||||||
|
llama_backend_free();
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
|
@@ -1,5 +1,6 @@
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
#include "unicode.h"
|
||||||
#include "console.h"
|
#include "console.h"
|
||||||
|
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
|
@@ -11,30 +12,6 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <locale>
|
#include <locale>
|
||||||
|
|
||||||
typedef int codepoint;
|
|
||||||
|
|
||||||
static std::string codepoint_to_utf8(codepoint cp) {
|
|
||||||
std::string result;
|
|
||||||
if (0x00 <= cp && cp <= 0x7f) {
|
|
||||||
result.push_back(cp);
|
|
||||||
} else if (0x80 <= cp && cp <= 0x7ff) {
|
|
||||||
result.push_back(0xc0 | ((cp >> 6) & 0x1f));
|
|
||||||
result.push_back(0x80 | (cp & 0x3f));
|
|
||||||
} else if (0x800 <= cp && cp <= 0xffff) {
|
|
||||||
result.push_back(0xe0 | ((cp >> 12) & 0x0f));
|
|
||||||
result.push_back(0x80 | ((cp >> 6) & 0x3f));
|
|
||||||
result.push_back(0x80 | (cp & 0x3f));
|
|
||||||
} else if (0x10000 <= cp && cp <= 0x10ffff) {
|
|
||||||
result.push_back(0xf0 | ((cp >> 18) & 0x07));
|
|
||||||
result.push_back(0x80 | ((cp >> 12) & 0x3f));
|
|
||||||
result.push_back(0x80 | ((cp >> 6) & 0x3f));
|
|
||||||
result.push_back(0x80 | (cp & 0x3f));
|
|
||||||
} else {
|
|
||||||
throw std::invalid_argument("invalid codepoint");
|
|
||||||
}
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
int main(int argc, char **argv) {
|
int main(int argc, char **argv) {
|
||||||
if (argc < 2) {
|
if (argc < 2) {
|
||||||
fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
|
fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
|
||||||
|
@@ -95,7 +72,7 @@ int main(int argc, char **argv) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for (codepoint cp = 0x0000; cp < 0xffff; ++cp) {
|
for (uint32_t cp = 0x0000; cp < 0xffff; ++cp) {
|
||||||
if (cp < 0xd800 || cp > 0xdfff) {
|
if (cp < 0xd800 || cp > 0xdfff) {
|
||||||
std::string str = codepoint_to_utf8(cp);
|
std::string str = codepoint_to_utf8(cp);
|
||||||
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
|
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
|
||||||
|
@@ -107,7 +84,7 @@ int main(int argc, char **argv) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for (codepoint cp = 0x10000; cp < 0x0010ffff; ++cp) {
|
for (uint32_t cp = 0x10000; cp < 0x0010ffff; ++cp) {
|
||||||
std::string str = codepoint_to_utf8(cp);
|
std::string str = codepoint_to_utf8(cp);
|
||||||
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
|
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
|
||||||
std::string check = llama_detokenize_spm(ctx, tokens);
|
std::string check = llama_detokenize_spm(ctx, tokens);
|
||||||
|
|
462
unicode.h
Normal file
|
@@ -0,0 +1,462 @@
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <cassert>
|
||||||
|
#include <stdexcept>
|
||||||
|
#include <vector>
|
||||||
|
#include <unordered_map>
|
||||||
|
|
||||||
|
static const std::vector<std::pair<uint32_t, uint32_t>> digit_ranges = {
|
||||||
|
{0x30, 0x39}, {0xB2, 0xB3}, {0xB9, 0xB9}, {0x660, 0x669}, {0x6F0, 0x6F9}, {0x7C0, 0x7C9}, {0x966, 0x96F}, {0x9E6, 0x9EF}, {0xA66, 0xA6F}, {0xAE6, 0xAEF}, {0xB66, 0xB6F}, {0xBE6, 0xBEF}, {0xC66, 0xC6F},
|
||||||
|
{0xCE6, 0xCEF}, {0xD66, 0xD6F}, {0xDE6, 0xDEF}, {0xE50, 0xE59}, {0xED0, 0xED9}, {0xF20, 0xF29}, {0x1040, 0x1049}, {0x1090, 0x1099}, {0x1369, 0x1371}, {0x17E0, 0x17E9}, {0x1810, 0x1819}, {0x1946, 0x194F},
|
||||||
|
{0x19D0, 0x19DA}, {0x1A80, 0x1A89}, {0x1A90, 0x1A99}, {0x1B50, 0x1B59}, {0x1BB0, 0x1BB9}, {0x1C40, 0x1C49}, {0x1C50, 0x1C59}, {0x2070, 0x2070}, {0x2074, 0x2079}, {0x2080, 0x2089}, {0x2460, 0x2468},
|
||||||
|
{0x2474, 0x247C}, {0x2488, 0x2490}, {0x24EA, 0x24EA}, {0x24F5, 0x24FD}, {0x24FF, 0x24FF}, {0x2776, 0x277E}, {0x2780, 0x2788}, {0x278A, 0x2792}, {0xA620, 0xA629}, {0xA8D0, 0xA8D9}, {0xA900, 0xA909},
|
||||||
|
{0xA9D0, 0xA9D9}, {0xA9F0, 0xA9F9}, {0xAA50, 0xAA59}, {0xABF0, 0xABF9}, {0xFF10, 0xFF19}, {0x104A0, 0x104A9}, {0x10A40, 0x10A43}, {0x10D30, 0x10D39}, {0x10E60, 0x10E68}, {0x11052, 0x1105A},
|
||||||
|
{0x11066, 0x1106F}, {0x110F0, 0x110F9}, {0x11136, 0x1113F}, {0x111D0, 0x111D9}, {0x112F0, 0x112F9}, {0x11450, 0x11459}, {0x114D0, 0x114D9}, {0x11650, 0x11659}, {0x116C0, 0x116C9}, {0x11730, 0x11739},
|
||||||
|
{0x118E0, 0x118E9}, {0x11950, 0x11959}, {0x11C50, 0x11C59}, {0x11D50, 0x11D59}, {0x11DA0, 0x11DA9}, {0x16A60, 0x16A69}, {0x16B50, 0x16B59}, {0x1D7CE, 0x1D7FF}, {0x1E140, 0x1E149}, {0x1E2F0, 0x1E2F9},
|
||||||
|
{0x1E950, 0x1E959}, {0x1F100, 0x1F10A}, {0x1FBF0, 0x1FBF9},
|
||||||
|
};
|
||||||
|
|
||||||
|
static const std::vector<std::pair<uint32_t, uint32_t>> letter_ranges = {
|
||||||
|
{0x41, 0x5A}, {0x61, 0x7A}, {0xAA, 0xAA}, {0xB5, 0xB5}, {0xBA, 0xBA}, {0xC0, 0xD6}, {0xD8, 0xF6}, {0xF8, 0x2C1}, {0x2C6, 0x2D1}, {0x2E0, 0x2E4}, {0x2EC, 0x2EC}, {0x2EE, 0x2EE}, {0x370, 0x374},
|
||||||
|
{0x376, 0x377}, {0x37A, 0x37D}, {0x37F, 0x37F}, {0x386, 0x386}, {0x388, 0x38A}, {0x38C, 0x38C}, {0x38E, 0x3A1}, {0x3A3, 0x3F5}, {0x3F7, 0x481}, {0x48A, 0x52F}, {0x531, 0x556}, {0x559, 0x559},
|
||||||
|
{0x560, 0x588}, {0x5D0, 0x5EA}, {0x5EF, 0x5F2}, {0x620, 0x64A}, {0x66E, 0x66F}, {0x671, 0x6D3}, {0x6D5, 0x6D5}, {0x6E5, 0x6E6}, {0x6EE, 0x6EF}, {0x6FA, 0x6FC}, {0x6FF, 0x6FF}, {0x710, 0x710},
|
||||||
|
{0x712, 0x72F}, {0x74D, 0x7A5}, {0x7B1, 0x7B1}, {0x7CA, 0x7EA}, {0x7F4, 0x7F5}, {0x7FA, 0x7FA}, {0x800, 0x815}, {0x81A, 0x81A}, {0x824, 0x824}, {0x828, 0x828}, {0x840, 0x858}, {0x860, 0x86A},
|
||||||
|
{0x8A0, 0x8B4}, {0x8B6, 0x8C7}, {0x904, 0x939}, {0x93D, 0x93D}, {0x950, 0x950}, {0x958, 0x961}, {0x971, 0x980}, {0x985, 0x98C}, {0x98F, 0x990}, {0x993, 0x9A8}, {0x9AA, 0x9B0}, {0x9B2, 0x9B2},
|
||||||
|
{0x9B6, 0x9B9}, {0x9BD, 0x9BD}, {0x9CE, 0x9CE}, {0x9DC, 0x9DD}, {0x9DF, 0x9E1}, {0x9F0, 0x9F1}, {0x9FC, 0x9FC}, {0xA05, 0xA0A}, {0xA0F, 0xA10}, {0xA13, 0xA28}, {0xA2A, 0xA30}, {0xA32, 0xA33},
|
||||||
|
{0xA35, 0xA36}, {0xA38, 0xA39}, {0xA59, 0xA5C}, {0xA5E, 0xA5E}, {0xA72, 0xA74}, {0xA85, 0xA8D}, {0xA8F, 0xA91}, {0xA93, 0xAA8}, {0xAAA, 0xAB0}, {0xAB2, 0xAB3}, {0xAB5, 0xAB9}, {0xABD, 0xABD},
|
||||||
|
{0xAD0, 0xAD0}, {0xAE0, 0xAE1}, {0xAF9, 0xAF9}, {0xB05, 0xB0C}, {0xB0F, 0xB10}, {0xB13, 0xB28}, {0xB2A, 0xB30}, {0xB32, 0xB33}, {0xB35, 0xB39}, {0xB3D, 0xB3D}, {0xB5C, 0xB5D}, {0xB5F, 0xB61},
|
||||||
|
{0xB71, 0xB71}, {0xB83, 0xB83}, {0xB85, 0xB8A}, {0xB8E, 0xB90}, {0xB92, 0xB95}, {0xB99, 0xB9A}, {0xB9C, 0xB9C}, {0xB9E, 0xB9F}, {0xBA3, 0xBA4}, {0xBA8, 0xBAA}, {0xBAE, 0xBB9}, {0xBD0, 0xBD0},
|
||||||
|
{0xC05, 0xC0C}, {0xC0E, 0xC10}, {0xC12, 0xC28}, {0xC2A, 0xC39}, {0xC3D, 0xC3D}, {0xC58, 0xC5A}, {0xC60, 0xC61}, {0xC80, 0xC80}, {0xC85, 0xC8C}, {0xC8E, 0xC90}, {0xC92, 0xCA8}, {0xCAA, 0xCB3},
|
||||||
|
{0xCB5, 0xCB9}, {0xCBD, 0xCBD}, {0xCDE, 0xCDE}, {0xCE0, 0xCE1}, {0xCF1, 0xCF2}, {0xD04, 0xD0C}, {0xD0E, 0xD10}, {0xD12, 0xD3A}, {0xD3D, 0xD3D}, {0xD4E, 0xD4E}, {0xD54, 0xD56}, {0xD5F, 0xD61},
|
||||||
|
{0xD7A, 0xD7F}, {0xD85, 0xD96}, {0xD9A, 0xDB1}, {0xDB3, 0xDBB}, {0xDBD, 0xDBD}, {0xDC0, 0xDC6}, {0xE01, 0xE30}, {0xE32, 0xE33}, {0xE40, 0xE46}, {0xE81, 0xE82}, {0xE84, 0xE84}, {0xE86, 0xE8A},
|
||||||
|
{0xE8C, 0xEA3}, {0xEA5, 0xEA5}, {0xEA7, 0xEB0}, {0xEB2, 0xEB3}, {0xEBD, 0xEBD}, {0xEC0, 0xEC4}, {0xEC6, 0xEC6}, {0xEDC, 0xEDF}, {0xF00, 0xF00}, {0xF40, 0xF47}, {0xF49, 0xF6C}, {0xF88, 0xF8C},
|
||||||
|
{0x1000, 0x102A}, {0x103F, 0x103F}, {0x1050, 0x1055}, {0x105A, 0x105D}, {0x1061, 0x1061}, {0x1065, 0x1066}, {0x106E, 0x1070}, {0x1075, 0x1081}, {0x108E, 0x108E}, {0x10A0, 0x10C5}, {0x10C7, 0x10C7},
|
||||||
|
{0x10CD, 0x10CD}, {0x10D0, 0x10FA}, {0x10FC, 0x1248}, {0x124A, 0x124D}, {0x1250, 0x1256}, {0x1258, 0x1258}, {0x125A, 0x125D}, {0x1260, 0x1288}, {0x128A, 0x128D}, {0x1290, 0x12B0}, {0x12B2, 0x12B5},
|
||||||
|
{0x12B8, 0x12BE}, {0x12C0, 0x12C0}, {0x12C2, 0x12C5}, {0x12C8, 0x12D6}, {0x12D8, 0x1310}, {0x1312, 0x1315}, {0x1318, 0x135A}, {0x1380, 0x138F}, {0x13A0, 0x13F5}, {0x13F8, 0x13FD}, {0x1401, 0x166C},
|
||||||
|
{0x166F, 0x167F}, {0x1681, 0x169A}, {0x16A0, 0x16EA}, {0x16F1, 0x16F8}, {0x1700, 0x170C}, {0x170E, 0x1711}, {0x1720, 0x1731}, {0x1740, 0x1751}, {0x1760, 0x176C}, {0x176E, 0x1770}, {0x1780, 0x17B3},
|
||||||
|
{0x17D7, 0x17D7}, {0x17DC, 0x17DC}, {0x1820, 0x1878}, {0x1880, 0x1884}, {0x1887, 0x18A8}, {0x18AA, 0x18AA}, {0x18B0, 0x18F5}, {0x1900, 0x191E}, {0x1950, 0x196D}, {0x1970, 0x1974}, {0x1980, 0x19AB},
|
||||||
|
{0x19B0, 0x19C9}, {0x1A00, 0x1A16}, {0x1A20, 0x1A54}, {0x1AA7, 0x1AA7}, {0x1B05, 0x1B33}, {0x1B45, 0x1B4B}, {0x1B83, 0x1BA0}, {0x1BAE, 0x1BAF}, {0x1BBA, 0x1BE5}, {0x1C00, 0x1C23}, {0x1C4D, 0x1C4F},
|
||||||
|
{0x1C5A, 0x1C7D}, {0x1C80, 0x1C88}, {0x1C90, 0x1CBA}, {0x1CBD, 0x1CBF}, {0x1CE9, 0x1CEC}, {0x1CEE, 0x1CF3}, {0x1CF5, 0x1CF6}, {0x1CFA, 0x1CFA}, {0x1D00, 0x1DBF}, {0x1E00, 0x1F15}, {0x1F18, 0x1F1D},
|
||||||
|
{0x1F20, 0x1F45}, {0x1F48, 0x1F4D}, {0x1F50, 0x1F57}, {0x1F59, 0x1F59}, {0x1F5B, 0x1F5B}, {0x1F5D, 0x1F5D}, {0x1F5F, 0x1F7D}, {0x1F80, 0x1FB4}, {0x1FB6, 0x1FBC}, {0x1FBE, 0x1FBE}, {0x1FC2, 0x1FC4},
|
||||||
|
{0x1FC6, 0x1FCC}, {0x1FD0, 0x1FD3}, {0x1FD6, 0x1FDB}, {0x1FE0, 0x1FEC}, {0x1FF2, 0x1FF4}, {0x1FF6, 0x1FFC}, {0x2071, 0x2071}, {0x207F, 0x207F}, {0x2090, 0x209C}, {0x2102, 0x2102}, {0x2107, 0x2107},
|
||||||
|
{0x210A, 0x2113}, {0x2115, 0x2115}, {0x2119, 0x211D}, {0x2124, 0x2124}, {0x2126, 0x2126}, {0x2128, 0x2128}, {0x212A, 0x212D}, {0x212F, 0x2139}, {0x213C, 0x213F}, {0x2145, 0x2149}, {0x214E, 0x214E},
|
||||||
|
{0x2183, 0x2184}, {0x2C00, 0x2C2E}, {0x2C30, 0x2C5E}, {0x2C60, 0x2CE4}, {0x2CEB, 0x2CEE}, {0x2CF2, 0x2CF3}, {0x2D00, 0x2D25}, {0x2D27, 0x2D27}, {0x2D2D, 0x2D2D}, {0x2D30, 0x2D67}, {0x2D6F, 0x2D6F},
|
||||||
|
{0x2D80, 0x2D96}, {0x2DA0, 0x2DA6}, {0x2DA8, 0x2DAE}, {0x2DB0, 0x2DB6}, {0x2DB8, 0x2DBE}, {0x2DC0, 0x2DC6}, {0x2DC8, 0x2DCE}, {0x2DD0, 0x2DD6}, {0x2DD8, 0x2DDE}, {0x2E2F, 0x2E2F}, {0x3005, 0x3006},
|
||||||
|
{0x3031, 0x3035}, {0x303B, 0x303C}, {0x3041, 0x3096}, {0x309D, 0x309F}, {0x30A1, 0x30FA}, {0x30FC, 0x30FF}, {0x3105, 0x312F}, {0x3131, 0x318E}, {0x31A0, 0x31BF}, {0x31F0, 0x31FF}, {0x3400, 0x4DBF},
|
||||||
|
{0x4E00, 0x9FFC}, {0xA000, 0xA48C}, {0xA4D0, 0xA4FD}, {0xA500, 0xA60C}, {0xA610, 0xA61F}, {0xA62A, 0xA62B}, {0xA640, 0xA66E}, {0xA67F, 0xA69D}, {0xA6A0, 0xA6E5}, {0xA717, 0xA71F}, {0xA722, 0xA788},
|
||||||
|
{0xA78B, 0xA7BF}, {0xA7C2, 0xA7CA}, {0xA7F5, 0xA801}, {0xA803, 0xA805}, {0xA807, 0xA80A}, {0xA80C, 0xA822}, {0xA840, 0xA873}, {0xA882, 0xA8B3}, {0xA8F2, 0xA8F7}, {0xA8FB, 0xA8FB}, {0xA8FD, 0xA8FE},
|
||||||
|
{0xA90A, 0xA925}, {0xA930, 0xA946}, {0xA960, 0xA97C}, {0xA984, 0xA9B2}, {0xA9CF, 0xA9CF}, {0xA9E0, 0xA9E4}, {0xA9E6, 0xA9EF}, {0xA9FA, 0xA9FE}, {0xAA00, 0xAA28}, {0xAA40, 0xAA42}, {0xAA44, 0xAA4B},
|
||||||
|
{0xAA60, 0xAA76}, {0xAA7A, 0xAA7A}, {0xAA7E, 0xAAAF}, {0xAAB1, 0xAAB1}, {0xAAB5, 0xAAB6}, {0xAAB9, 0xAABD}, {0xAAC0, 0xAAC0}, {0xAAC2, 0xAAC2}, {0xAADB, 0xAADD}, {0xAAE0, 0xAAEA}, {0xAAF2, 0xAAF4},
|
||||||
|
{0xAB01, 0xAB06}, {0xAB09, 0xAB0E}, {0xAB11, 0xAB16}, {0xAB20, 0xAB26}, {0xAB28, 0xAB2E}, {0xAB30, 0xAB5A}, {0xAB5C, 0xAB69}, {0xAB70, 0xABE2}, {0xAC00, 0xD7A3}, {0xD7B0, 0xD7C6}, {0xD7CB, 0xD7FB},
|
||||||
|
{0xF900, 0xFA6D}, {0xFA70, 0xFAD9}, {0xFB00, 0xFB06}, {0xFB13, 0xFB17}, {0xFB1D, 0xFB1D}, {0xFB1F, 0xFB28}, {0xFB2A, 0xFB36}, {0xFB38, 0xFB3C}, {0xFB3E, 0xFB3E}, {0xFB40, 0xFB41}, {0xFB43, 0xFB44},
|
||||||
|
{0xFB46, 0xFBB1}, {0xFBD3, 0xFD3D}, {0xFD50, 0xFD8F}, {0xFD92, 0xFDC7}, {0xFDF0, 0xFDFB}, {0xFE70, 0xFE74}, {0xFE76, 0xFEFC}, {0xFF21, 0xFF3A}, {0xFF41, 0xFF5A}, {0xFF66, 0xFFBE}, {0xFFC2, 0xFFC7},
|
||||||
|
{0xFFCA, 0xFFCF}, {0xFFD2, 0xFFD7}, {0xFFDA, 0xFFDC}, {0x10000, 0x1000B}, {0x1000D, 0x10026}, {0x10028, 0x1003A}, {0x1003C, 0x1003D}, {0x1003F, 0x1004D}, {0x10050, 0x1005D}, {0x10080, 0x100FA},
|
||||||
|
{0x10280, 0x1029C}, {0x102A0, 0x102D0}, {0x10300, 0x1031F}, {0x1032D, 0x10340}, {0x10342, 0x10349}, {0x10350, 0x10375}, {0x10380, 0x1039D}, {0x103A0, 0x103C3}, {0x103C8, 0x103CF}, {0x10400, 0x1049D},
|
||||||
|
{0x104B0, 0x104D3}, {0x104D8, 0x104FB}, {0x10500, 0x10527}, {0x10530, 0x10563}, {0x10600, 0x10736}, {0x10740, 0x10755}, {0x10760, 0x10767}, {0x10800, 0x10805}, {0x10808, 0x10808}, {0x1080A, 0x10835},
|
||||||
|
{0x10837, 0x10838}, {0x1083C, 0x1083C}, {0x1083F, 0x10855}, {0x10860, 0x10876}, {0x10880, 0x1089E}, {0x108E0, 0x108F2}, {0x108F4, 0x108F5}, {0x10900, 0x10915}, {0x10920, 0x10939}, {0x10980, 0x109B7},
|
||||||
|
{0x109BE, 0x109BF}, {0x10A00, 0x10A00}, {0x10A10, 0x10A13}, {0x10A15, 0x10A17}, {0x10A19, 0x10A35}, {0x10A60, 0x10A7C}, {0x10A80, 0x10A9C}, {0x10AC0, 0x10AC7}, {0x10AC9, 0x10AE4}, {0x10B00, 0x10B35},
|
||||||
|
{0x10B40, 0x10B55}, {0x10B60, 0x10B72}, {0x10B80, 0x10B91}, {0x10C00, 0x10C48}, {0x10C80, 0x10CB2}, {0x10CC0, 0x10CF2}, {0x10D00, 0x10D23}, {0x10E80, 0x10EA9}, {0x10EB0, 0x10EB1}, {0x10F00, 0x10F1C},
|
||||||
|
{0x10F27, 0x10F27}, {0x10F30, 0x10F45}, {0x10FB0, 0x10FC4}, {0x10FE0, 0x10FF6}, {0x11003, 0x11037}, {0x11083, 0x110AF}, {0x110D0, 0x110E8}, {0x11103, 0x11126}, {0x11144, 0x11144}, {0x11147, 0x11147},
|
||||||
|
{0x11150, 0x11172}, {0x11176, 0x11176}, {0x11183, 0x111B2}, {0x111C1, 0x111C4}, {0x111DA, 0x111DA}, {0x111DC, 0x111DC}, {0x11200, 0x11211}, {0x11213, 0x1122B}, {0x11280, 0x11286}, {0x11288, 0x11288},
|
||||||
|
{0x1128A, 0x1128D}, {0x1128F, 0x1129D}, {0x1129F, 0x112A8}, {0x112B0, 0x112DE}, {0x11305, 0x1130C}, {0x1130F, 0x11310}, {0x11313, 0x11328}, {0x1132A, 0x11330}, {0x11332, 0x11333}, {0x11335, 0x11339},
|
||||||
|
{0x1133D, 0x1133D}, {0x11350, 0x11350}, {0x1135D, 0x11361}, {0x11400, 0x11434}, {0x11447, 0x1144A}, {0x1145F, 0x11461}, {0x11480, 0x114AF}, {0x114C4, 0x114C5}, {0x114C7, 0x114C7}, {0x11580, 0x115AE},
|
||||||
|
{0x115D8, 0x115DB}, {0x11600, 0x1162F}, {0x11644, 0x11644}, {0x11680, 0x116AA}, {0x116B8, 0x116B8}, {0x11700, 0x1171A}, {0x11800, 0x1182B}, {0x118A0, 0x118DF}, {0x118FF, 0x11906}, {0x11909, 0x11909},
|
||||||
|
{0x1190C, 0x11913}, {0x11915, 0x11916}, {0x11918, 0x1192F}, {0x1193F, 0x1193F}, {0x11941, 0x11941}, {0x119A0, 0x119A7}, {0x119AA, 0x119D0}, {0x119E1, 0x119E1}, {0x119E3, 0x119E3}, {0x11A00, 0x11A00},
|
||||||
|
{0x11A0B, 0x11A32}, {0x11A3A, 0x11A3A}, {0x11A50, 0x11A50}, {0x11A5C, 0x11A89}, {0x11A9D, 0x11A9D}, {0x11AC0, 0x11AF8}, {0x11C00, 0x11C08}, {0x11C0A, 0x11C2E}, {0x11C40, 0x11C40}, {0x11C72, 0x11C8F},
|
||||||
|
{0x11D00, 0x11D06}, {0x11D08, 0x11D09}, {0x11D0B, 0x11D30}, {0x11D46, 0x11D46}, {0x11D60, 0x11D65}, {0x11D67, 0x11D68}, {0x11D6A, 0x11D89}, {0x11D98, 0x11D98}, {0x11EE0, 0x11EF2}, {0x11FB0, 0x11FB0},
|
||||||
|
{0x12000, 0x12399}, {0x12480, 0x12543}, {0x13000, 0x1342E}, {0x14400, 0x14646}, {0x16800, 0x16A38}, {0x16A40, 0x16A5E}, {0x16AD0, 0x16AED}, {0x16B00, 0x16B2F}, {0x16B40, 0x16B43}, {0x16B63, 0x16B77},
|
||||||
|
{0x16B7D, 0x16B8F}, {0x16E40, 0x16E7F}, {0x16F00, 0x16F4A}, {0x16F50, 0x16F50}, {0x16F93, 0x16F9F}, {0x16FE0, 0x16FE1}, {0x16FE3, 0x16FE3}, {0x17000, 0x187F7}, {0x18800, 0x18CD5}, {0x18D00, 0x18D08},
|
||||||
|
{0x1B000, 0x1B11E}, {0x1B150, 0x1B152}, {0x1B164, 0x1B167}, {0x1B170, 0x1B2FB}, {0x1BC00, 0x1BC6A}, {0x1BC70, 0x1BC7C}, {0x1BC80, 0x1BC88}, {0x1BC90, 0x1BC99}, {0x1D400, 0x1D454}, {0x1D456, 0x1D49C},
|
||||||
|
{0x1D49E, 0x1D49F}, {0x1D4A2, 0x1D4A2}, {0x1D4A5, 0x1D4A6}, {0x1D4A9, 0x1D4AC}, {0x1D4AE, 0x1D4B9}, {0x1D4BB, 0x1D4BB}, {0x1D4BD, 0x1D4C3}, {0x1D4C5, 0x1D505}, {0x1D507, 0x1D50A}, {0x1D50D, 0x1D514},
|
||||||
|
{0x1D516, 0x1D51C}, {0x1D51E, 0x1D539}, {0x1D53B, 0x1D53E}, {0x1D540, 0x1D544}, {0x1D546, 0x1D546}, {0x1D54A, 0x1D550}, {0x1D552, 0x1D6A5}, {0x1D6A8, 0x1D6C0}, {0x1D6C2, 0x1D6DA}, {0x1D6DC, 0x1D6FA},
|
||||||
|
{0x1D6FC, 0x1D714}, {0x1D716, 0x1D734}, {0x1D736, 0x1D74E}, {0x1D750, 0x1D76E}, {0x1D770, 0x1D788}, {0x1D78A, 0x1D7A8}, {0x1D7AA, 0x1D7C2}, {0x1D7C4, 0x1D7CB}, {0x1E100, 0x1E12C}, {0x1E137, 0x1E13D},
|
||||||
|
{0x1E14E, 0x1E14E}, {0x1E2C0, 0x1E2EB}, {0x1E800, 0x1E8C4}, {0x1E900, 0x1E943}, {0x1E94B, 0x1E94B}, {0x1EE00, 0x1EE03}, {0x1EE05, 0x1EE1F}, {0x1EE21, 0x1EE22}, {0x1EE24, 0x1EE24}, {0x1EE27, 0x1EE27},
|
||||||
|
{0x1EE29, 0x1EE32}, {0x1EE34, 0x1EE37}, {0x1EE39, 0x1EE39}, {0x1EE3B, 0x1EE3B}, {0x1EE42, 0x1EE42}, {0x1EE47, 0x1EE47}, {0x1EE49, 0x1EE49}, {0x1EE4B, 0x1EE4B}, {0x1EE4D, 0x1EE4F}, {0x1EE51, 0x1EE52},
|
||||||
|
{0x1EE54, 0x1EE54}, {0x1EE57, 0x1EE57}, {0x1EE59, 0x1EE59}, {0x1EE5B, 0x1EE5B}, {0x1EE5D, 0x1EE5D}, {0x1EE5F, 0x1EE5F}, {0x1EE61, 0x1EE62}, {0x1EE64, 0x1EE64}, {0x1EE67, 0x1EE6A}, {0x1EE6C, 0x1EE72},
|
||||||
|
{0x1EE74, 0x1EE77}, {0x1EE79, 0x1EE7C}, {0x1EE7E, 0x1EE7E}, {0x1EE80, 0x1EE89}, {0x1EE8B, 0x1EE9B}, {0x1EEA1, 0x1EEA3}, {0x1EEA5, 0x1EEA9}, {0x1EEAB, 0x1EEBB}, {0x20000, 0x2A6DD}, {0x2A700, 0x2B734},
|
||||||
|
{0x2B740, 0x2B81D}, {0x2B820, 0x2CEA1}, {0x2CEB0, 0x2EBE0}, {0x2F800, 0x2FA1D}, {0x30000, 0x3134A},
|
||||||
|
};
|
||||||
|
|
||||||
|
static const std::vector<std::pair<uint32_t, uint32_t>> whitespace_ranges = {
|
||||||
|
{0x9, 0xD}, {0x1C, 0x20}, {0x85, 0x85}, {0xA0, 0xA0}, {0x1680, 0x1680}, {0x2000, 0x200A}, {0x2028, 0x2029}, {0x202F, 0x202F}, {0x205F, 0x205F}, {0x3000, 0x3000},
|
||||||
|
};
|
||||||
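The tables above are sorted, non-overlapping codepoint ranges. A hedged sketch of how such a table can be queried (the helper name is hypothetical; the accessors that unicode.h actually provides are not part of this excerpt):

#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

// true if cp falls inside any [first, second] range of a sorted table
static bool codepoint_in_ranges(uint32_t cp,
                                const std::vector<std::pair<uint32_t, uint32_t>> & ranges) {
    auto it = std::lower_bound(ranges.begin(), ranges.end(), cp,
        [](const std::pair<uint32_t, uint32_t> & r, uint32_t v) { return r.second < v; });
    return it != ranges.end() && cp >= it->first;
}

// e.g. codepoint_in_ranges(0x0661, digit_ranges) -> true (ARABIC-INDIC DIGIT ONE)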
|
|
||||||
|
static const std::vector<std::pair<uint32_t, uint32_t>> accent_mark_ranges = {
|
||||||
|
{0x300, 0x36F}, {0x483, 0x489}, {0x591, 0x5BD}, {0x5BF, 0x5BF}, {0x5C1, 0x5C2}, {0x5C4, 0x5C5}, {0x5C7, 0x5C7}, {0x610, 0x61A}, {0x64B, 0x65F}, {0x670, 0x670}, {0x6D6, 0x6DC}, {0x6DF, 0x6E4},
{0x6E7, 0x6E8}, {0x6EA, 0x6ED}, {0x711, 0x711}, {0x730, 0x74A}, {0x7A6, 0x7B0}, {0x7EB, 0x7F3}, {0x7FD, 0x7FD}, {0x816, 0x819}, {0x81B, 0x823}, {0x825, 0x827}, {0x829, 0x82D}, {0x859, 0x85B},
{0x8D3, 0x8E1}, {0x8E3, 0x903}, {0x93A, 0x93C}, {0x93E, 0x94F}, {0x951, 0x957}, {0x962, 0x963}, {0x981, 0x983}, {0x9BC, 0x9BC}, {0x9BE, 0x9C4}, {0x9C7, 0x9C8}, {0x9CB, 0x9CD}, {0x9D7, 0x9D7},
{0x9E2, 0x9E3}, {0x9FE, 0x9FE}, {0xA01, 0xA03}, {0xA3C, 0xA3C}, {0xA3E, 0xA42}, {0xA47, 0xA48}, {0xA4B, 0xA4D}, {0xA51, 0xA51}, {0xA70, 0xA71}, {0xA75, 0xA75}, {0xA81, 0xA83}, {0xABC, 0xABC},
{0xABE, 0xAC5}, {0xAC7, 0xAC9}, {0xACB, 0xACD}, {0xAE2, 0xAE3}, {0xAFA, 0xAFF}, {0xB01, 0xB03}, {0xB3C, 0xB3C}, {0xB3E, 0xB44}, {0xB47, 0xB48}, {0xB4B, 0xB4D}, {0xB55, 0xB57}, {0xB62, 0xB63},
{0xB82, 0xB82}, {0xBBE, 0xBC2}, {0xBC6, 0xBC8}, {0xBCA, 0xBCD}, {0xBD7, 0xBD7}, {0xC00, 0xC04}, {0xC3E, 0xC44}, {0xC46, 0xC48}, {0xC4A, 0xC4D}, {0xC55, 0xC56}, {0xC62, 0xC63}, {0xC81, 0xC83},
{0xCBC, 0xCBC}, {0xCBE, 0xCC4}, {0xCC6, 0xCC8}, {0xCCA, 0xCCD}, {0xCD5, 0xCD6}, {0xCE2, 0xCE3}, {0xD00, 0xD03}, {0xD3B, 0xD3C}, {0xD3E, 0xD44}, {0xD46, 0xD48}, {0xD4A, 0xD4D}, {0xD57, 0xD57},
{0xD62, 0xD63}, {0xD81, 0xD83}, {0xDCA, 0xDCA}, {0xDCF, 0xDD4}, {0xDD6, 0xDD6}, {0xDD8, 0xDDF}, {0xDF2, 0xDF3}, {0xE31, 0xE31}, {0xE34, 0xE3A}, {0xE47, 0xE4E}, {0xEB1, 0xEB1}, {0xEB4, 0xEBC},
{0xEC8, 0xECD}, {0xF18, 0xF19}, {0xF35, 0xF35}, {0xF37, 0xF37}, {0xF39, 0xF39}, {0xF3E, 0xF3F}, {0xF71, 0xF84}, {0xF86, 0xF87}, {0xF8D, 0xF97}, {0xF99, 0xFBC}, {0xFC6, 0xFC6}, {0x102B, 0x103E},
{0x1056, 0x1059}, {0x105E, 0x1060}, {0x1062, 0x1064}, {0x1067, 0x106D}, {0x1071, 0x1074}, {0x1082, 0x108D}, {0x108F, 0x108F}, {0x109A, 0x109D}, {0x135D, 0x135F}, {0x1712, 0x1714}, {0x1732, 0x1734},
{0x1752, 0x1753}, {0x1772, 0x1773}, {0x17B4, 0x17D3}, {0x17DD, 0x17DD}, {0x180B, 0x180D}, {0x1885, 0x1886}, {0x18A9, 0x18A9}, {0x1920, 0x192B}, {0x1930, 0x193B}, {0x1A17, 0x1A1B}, {0x1A55, 0x1A5E},
{0x1A60, 0x1A7C}, {0x1A7F, 0x1A7F}, {0x1AB0, 0x1AC0}, {0x1B00, 0x1B04}, {0x1B34, 0x1B44}, {0x1B6B, 0x1B73}, {0x1B80, 0x1B82}, {0x1BA1, 0x1BAD}, {0x1BE6, 0x1BF3}, {0x1C24, 0x1C37}, {0x1CD0, 0x1CD2},
{0x1CD4, 0x1CE8}, {0x1CED, 0x1CED}, {0x1CF4, 0x1CF4}, {0x1CF7, 0x1CF9}, {0x1DC0, 0x1DF9}, {0x1DFB, 0x1DFF}, {0x20D0, 0x20F0}, {0x2CEF, 0x2CF1}, {0x2D7F, 0x2D7F}, {0x2DE0, 0x2DFF}, {0x302A, 0x302F},
{0x3099, 0x309A}, {0xA66F, 0xA672}, {0xA674, 0xA67D}, {0xA69E, 0xA69F}, {0xA6F0, 0xA6F1}, {0xA802, 0xA802}, {0xA806, 0xA806}, {0xA80B, 0xA80B}, {0xA823, 0xA827}, {0xA82C, 0xA82C}, {0xA880, 0xA881},
{0xA8B4, 0xA8C5}, {0xA8E0, 0xA8F1}, {0xA8FF, 0xA8FF}, {0xA926, 0xA92D}, {0xA947, 0xA953}, {0xA980, 0xA983}, {0xA9B3, 0xA9C0}, {0xA9E5, 0xA9E5}, {0xAA29, 0xAA36}, {0xAA43, 0xAA43}, {0xAA4C, 0xAA4D},
{0xAA7B, 0xAA7D}, {0xAAB0, 0xAAB0}, {0xAAB2, 0xAAB4}, {0xAAB7, 0xAAB8}, {0xAABE, 0xAABF}, {0xAAC1, 0xAAC1}, {0xAAEB, 0xAAEF}, {0xAAF5, 0xAAF6}, {0xABE3, 0xABEA}, {0xABEC, 0xABED}, {0xFB1E, 0xFB1E},
{0xFE00, 0xFE0F}, {0xFE20, 0xFE2F}, {0x101FD, 0x101FD}, {0x102E0, 0x102E0}, {0x10376, 0x1037A}, {0x10A01, 0x10A03}, {0x10A05, 0x10A06}, {0x10A0C, 0x10A0F}, {0x10A38, 0x10A3A}, {0x10A3F, 0x10A3F},
{0x10AE5, 0x10AE6}, {0x10D24, 0x10D27}, {0x10EAB, 0x10EAC}, {0x10F46, 0x10F50}, {0x11000, 0x11002}, {0x11038, 0x11046}, {0x1107F, 0x11082}, {0x110B0, 0x110BA}, {0x11100, 0x11102}, {0x11127, 0x11134},
{0x11145, 0x11146}, {0x11173, 0x11173}, {0x11180, 0x11182}, {0x111B3, 0x111C0}, {0x111C9, 0x111CC}, {0x111CE, 0x111CF}, {0x1122C, 0x11237}, {0x1123E, 0x1123E}, {0x112DF, 0x112EA}, {0x11300, 0x11303},
{0x1133B, 0x1133C}, {0x1133E, 0x11344}, {0x11347, 0x11348}, {0x1134B, 0x1134D}, {0x11357, 0x11357}, {0x11362, 0x11363}, {0x11366, 0x1136C}, {0x11370, 0x11374}, {0x11435, 0x11446}, {0x1145E, 0x1145E},
{0x114B0, 0x114C3}, {0x115AF, 0x115B5}, {0x115B8, 0x115C0}, {0x115DC, 0x115DD}, {0x11630, 0x11640}, {0x116AB, 0x116B7}, {0x1171D, 0x1172B}, {0x1182C, 0x1183A}, {0x11930, 0x11935}, {0x11937, 0x11938},
{0x1193B, 0x1193E}, {0x11940, 0x11940}, {0x11942, 0x11943}, {0x119D1, 0x119D7}, {0x119DA, 0x119E0}, {0x119E4, 0x119E4}, {0x11A01, 0x11A0A}, {0x11A33, 0x11A39}, {0x11A3B, 0x11A3E}, {0x11A47, 0x11A47},
{0x11A51, 0x11A5B}, {0x11A8A, 0x11A99}, {0x11C2F, 0x11C36}, {0x11C38, 0x11C3F}, {0x11C92, 0x11CA7}, {0x11CA9, 0x11CB6}, {0x11D31, 0x11D36}, {0x11D3A, 0x11D3A}, {0x11D3C, 0x11D3D}, {0x11D3F, 0x11D45},
{0x11D47, 0x11D47}, {0x11D8A, 0x11D8E}, {0x11D90, 0x11D91}, {0x11D93, 0x11D97}, {0x11EF3, 0x11EF6}, {0x16AF0, 0x16AF4}, {0x16B30, 0x16B36}, {0x16F4F, 0x16F4F}, {0x16F51, 0x16F87}, {0x16F8F, 0x16F92},
{0x16FE4, 0x16FE4}, {0x16FF0, 0x16FF1}, {0x1BC9D, 0x1BC9E}, {0x1D165, 0x1D169}, {0x1D16D, 0x1D172}, {0x1D17B, 0x1D182}, {0x1D185, 0x1D18B}, {0x1D1AA, 0x1D1AD}, {0x1D242, 0x1D244}, {0x1DA00, 0x1DA36},
{0x1DA3B, 0x1DA6C}, {0x1DA75, 0x1DA75}, {0x1DA84, 0x1DA84}, {0x1DA9B, 0x1DA9F}, {0x1DAA1, 0x1DAAF}, {0x1E000, 0x1E006}, {0x1E008, 0x1E018}, {0x1E01B, 0x1E021}, {0x1E023, 0x1E024}, {0x1E026, 0x1E02A},
{0x1E130, 0x1E136}, {0x1E2EC, 0x1E2EF}, {0x1E8D0, 0x1E8D6}, {0x1E944, 0x1E94A}, {0xE0100, 0xE01EF},
};

static const std::vector<std::pair<uint32_t, uint32_t>> punctuation_ranges = {
{0x21, 0x23}, {0x25, 0x2A}, {0x2C, 0x2F}, {0x3A, 0x3B}, {0x3F, 0x40}, {0x5B, 0x5D}, {0x5F, 0x5F}, {0x7B, 0x7B}, {0x7D, 0x7D}, {0xA1, 0xA1}, {0xA7, 0xA7}, {0xAB, 0xAB}, {0xB6, 0xB7}, {0xBB, 0xBB},
{0xBF, 0xBF}, {0x37E, 0x37E}, {0x387, 0x387}, {0x55A, 0x55F}, {0x589, 0x58A}, {0x5BE, 0x5BE}, {0x5C0, 0x5C0}, {0x5C3, 0x5C3}, {0x5C6, 0x5C6}, {0x5F3, 0x5F4}, {0x609, 0x60A}, {0x60C, 0x60D},
{0x61B, 0x61B}, {0x61E, 0x61F}, {0x66A, 0x66D}, {0x6D4, 0x6D4}, {0x700, 0x70D}, {0x7F7, 0x7F9}, {0x830, 0x83E}, {0x85E, 0x85E}, {0x964, 0x965}, {0x970, 0x970}, {0x9FD, 0x9FD}, {0xA76, 0xA76},
{0xAF0, 0xAF0}, {0xC77, 0xC77}, {0xC84, 0xC84}, {0xDF4, 0xDF4}, {0xE4F, 0xE4F}, {0xE5A, 0xE5B}, {0xF04, 0xF12}, {0xF14, 0xF14}, {0xF3A, 0xF3D}, {0xF85, 0xF85}, {0xFD0, 0xFD4}, {0xFD9, 0xFDA},
{0x104A, 0x104F}, {0x10FB, 0x10FB}, {0x1360, 0x1368}, {0x1400, 0x1400}, {0x166E, 0x166E}, {0x169B, 0x169C}, {0x16EB, 0x16ED}, {0x1735, 0x1736}, {0x17D4, 0x17D6}, {0x17D8, 0x17DA}, {0x1800, 0x180A},
{0x1944, 0x1945}, {0x1A1E, 0x1A1F}, {0x1AA0, 0x1AA6}, {0x1AA8, 0x1AAD}, {0x1B5A, 0x1B60}, {0x1BFC, 0x1BFF}, {0x1C3B, 0x1C3F}, {0x1C7E, 0x1C7F}, {0x1CC0, 0x1CC7}, {0x1CD3, 0x1CD3}, {0x2010, 0x2027},
{0x2030, 0x2043}, {0x2045, 0x2051}, {0x2053, 0x205E}, {0x207D, 0x207E}, {0x208D, 0x208E}, {0x2308, 0x230B}, {0x2329, 0x232A}, {0x2768, 0x2775}, {0x27C5, 0x27C6}, {0x27E6, 0x27EF}, {0x2983, 0x2998},
{0x29D8, 0x29DB}, {0x29FC, 0x29FD}, {0x2CF9, 0x2CFC}, {0x2CFE, 0x2CFF}, {0x2D70, 0x2D70}, {0x2E00, 0x2E2E}, {0x2E30, 0x2E4F}, {0x2E52, 0x2E52}, {0x3001, 0x3003}, {0x3008, 0x3011}, {0x3014, 0x301F},
{0x3030, 0x3030}, {0x303D, 0x303D}, {0x30A0, 0x30A0}, {0x30FB, 0x30FB}, {0xA4FE, 0xA4FF}, {0xA60D, 0xA60F}, {0xA673, 0xA673}, {0xA67E, 0xA67E}, {0xA6F2, 0xA6F7}, {0xA874, 0xA877}, {0xA8CE, 0xA8CF},
{0xA8F8, 0xA8FA}, {0xA8FC, 0xA8FC}, {0xA92E, 0xA92F}, {0xA95F, 0xA95F}, {0xA9C1, 0xA9CD}, {0xA9DE, 0xA9DF}, {0xAA5C, 0xAA5F}, {0xAADE, 0xAADF}, {0xAAF0, 0xAAF1}, {0xABEB, 0xABEB}, {0xFD3E, 0xFD3F},
{0xFE10, 0xFE19}, {0xFE30, 0xFE52}, {0xFE54, 0xFE61}, {0xFE63, 0xFE63}, {0xFE68, 0xFE68}, {0xFE6A, 0xFE6B}, {0xFF01, 0xFF03}, {0xFF05, 0xFF0A}, {0xFF0C, 0xFF0F}, {0xFF1A, 0xFF1B}, {0xFF1F, 0xFF20},
{0xFF3B, 0xFF3D}, {0xFF3F, 0xFF3F}, {0xFF5B, 0xFF5B}, {0xFF5D, 0xFF5D}, {0xFF5F, 0xFF65}, {0x10100, 0x10102}, {0x1039F, 0x1039F}, {0x103D0, 0x103D0}, {0x1056F, 0x1056F}, {0x10857, 0x10857},
{0x1091F, 0x1091F}, {0x1093F, 0x1093F}, {0x10A50, 0x10A58}, {0x10A7F, 0x10A7F}, {0x10AF0, 0x10AF6}, {0x10B39, 0x10B3F}, {0x10B99, 0x10B9C}, {0x10EAD, 0x10EAD}, {0x10F55, 0x10F59}, {0x11047, 0x1104D},
{0x110BB, 0x110BC}, {0x110BE, 0x110C1}, {0x11140, 0x11143}, {0x11174, 0x11175}, {0x111C5, 0x111C8}, {0x111CD, 0x111CD}, {0x111DB, 0x111DB}, {0x111DD, 0x111DF}, {0x11238, 0x1123D}, {0x112A9, 0x112A9},
{0x1144B, 0x1144F}, {0x1145A, 0x1145B}, {0x1145D, 0x1145D}, {0x114C6, 0x114C6}, {0x115C1, 0x115D7}, {0x11641, 0x11643}, {0x11660, 0x1166C}, {0x1173C, 0x1173E}, {0x1183B, 0x1183B}, {0x11944, 0x11946},
{0x119E2, 0x119E2}, {0x11A3F, 0x11A46}, {0x11A9A, 0x11A9C}, {0x11A9E, 0x11AA2}, {0x11C41, 0x11C45}, {0x11C70, 0x11C71}, {0x11EF7, 0x11EF8}, {0x11FFF, 0x11FFF}, {0x12470, 0x12474}, {0x16A6E, 0x16A6F},
{0x16AF5, 0x16AF5}, {0x16B37, 0x16B3B}, {0x16B44, 0x16B44}, {0x16E97, 0x16E9A}, {0x16FE2, 0x16FE2}, {0x1BC9F, 0x1BC9F}, {0x1DA87, 0x1DA8B}, {0x1E95E, 0x1E95F},
};

static const std::vector<std::pair<uint32_t, uint32_t>> symbol_ranges = {
{0x24, 0x24}, {0x2B, 0x2B}, {0x3C, 0x3E}, {0x5E, 0x5E}, {0x60, 0x60}, {0x7C, 0x7C}, {0x7E, 0x7E}, {0xA2, 0xA6}, {0xA8, 0xA9}, {0xAC, 0xAC}, {0xAE, 0xB1}, {0xB4, 0xB4}, {0xB8, 0xB8}, {0xD7, 0xD7},
{0xF7, 0xF7}, {0x2C2, 0x2C5}, {0x2D2, 0x2DF}, {0x2E5, 0x2EB}, {0x2ED, 0x2ED}, {0x2EF, 0x2FF}, {0x375, 0x375}, {0x384, 0x385}, {0x3F6, 0x3F6}, {0x482, 0x482}, {0x58D, 0x58F}, {0x606, 0x608},
{0x60B, 0x60B}, {0x60E, 0x60F}, {0x6DE, 0x6DE}, {0x6E9, 0x6E9}, {0x6FD, 0x6FE}, {0x7F6, 0x7F6}, {0x7FE, 0x7FF}, {0x9F2, 0x9F3}, {0x9FA, 0x9FB}, {0xAF1, 0xAF1}, {0xB70, 0xB70}, {0xBF3, 0xBFA},
{0xC7F, 0xC7F}, {0xD4F, 0xD4F}, {0xD79, 0xD79}, {0xE3F, 0xE3F}, {0xF01, 0xF03}, {0xF13, 0xF13}, {0xF15, 0xF17}, {0xF1A, 0xF1F}, {0xF34, 0xF34}, {0xF36, 0xF36}, {0xF38, 0xF38}, {0xFBE, 0xFC5},
{0xFC7, 0xFCC}, {0xFCE, 0xFCF}, {0xFD5, 0xFD8}, {0x109E, 0x109F}, {0x1390, 0x1399}, {0x166D, 0x166D}, {0x17DB, 0x17DB}, {0x1940, 0x1940}, {0x19DE, 0x19FF}, {0x1B61, 0x1B6A}, {0x1B74, 0x1B7C},
{0x1FBD, 0x1FBD}, {0x1FBF, 0x1FC1}, {0x1FCD, 0x1FCF}, {0x1FDD, 0x1FDF}, {0x1FED, 0x1FEF}, {0x1FFD, 0x1FFE}, {0x2044, 0x2044}, {0x2052, 0x2052}, {0x207A, 0x207C}, {0x208A, 0x208C}, {0x20A0, 0x20BF},
{0x2100, 0x2101}, {0x2103, 0x2106}, {0x2108, 0x2109}, {0x2114, 0x2114}, {0x2116, 0x2118}, {0x211E, 0x2123}, {0x2125, 0x2125}, {0x2127, 0x2127}, {0x2129, 0x2129}, {0x212E, 0x212E}, {0x213A, 0x213B},
{0x2140, 0x2144}, {0x214A, 0x214D}, {0x214F, 0x214F}, {0x218A, 0x218B}, {0x2190, 0x2307}, {0x230C, 0x2328}, {0x232B, 0x2426}, {0x2440, 0x244A}, {0x249C, 0x24E9}, {0x2500, 0x2767}, {0x2794, 0x27C4},
{0x27C7, 0x27E5}, {0x27F0, 0x2982}, {0x2999, 0x29D7}, {0x29DC, 0x29FB}, {0x29FE, 0x2B73}, {0x2B76, 0x2B95}, {0x2B97, 0x2BFF}, {0x2CE5, 0x2CEA}, {0x2E50, 0x2E51}, {0x2E80, 0x2E99}, {0x2E9B, 0x2EF3},
{0x2F00, 0x2FD5}, {0x2FF0, 0x2FFB}, {0x3004, 0x3004}, {0x3012, 0x3013}, {0x3020, 0x3020}, {0x3036, 0x3037}, {0x303E, 0x303F}, {0x309B, 0x309C}, {0x3190, 0x3191}, {0x3196, 0x319F}, {0x31C0, 0x31E3},
{0x3200, 0x321E}, {0x322A, 0x3247}, {0x3250, 0x3250}, {0x3260, 0x327F}, {0x328A, 0x32B0}, {0x32C0, 0x33FF}, {0x4DC0, 0x4DFF}, {0xA490, 0xA4C6}, {0xA700, 0xA716}, {0xA720, 0xA721}, {0xA789, 0xA78A},
{0xA828, 0xA82B}, {0xA836, 0xA839}, {0xAA77, 0xAA79}, {0xAB5B, 0xAB5B}, {0xAB6A, 0xAB6B}, {0xFB29, 0xFB29}, {0xFBB2, 0xFBC1}, {0xFDFC, 0xFDFD}, {0xFE62, 0xFE62}, {0xFE64, 0xFE66}, {0xFE69, 0xFE69},
{0xFF04, 0xFF04}, {0xFF0B, 0xFF0B}, {0xFF1C, 0xFF1E}, {0xFF3E, 0xFF3E}, {0xFF40, 0xFF40}, {0xFF5C, 0xFF5C}, {0xFF5E, 0xFF5E}, {0xFFE0, 0xFFE6}, {0xFFE8, 0xFFEE}, {0xFFFC, 0xFFFD}, {0x10137, 0x1013F},
{0x10179, 0x10189}, {0x1018C, 0x1018E}, {0x10190, 0x1019C}, {0x101A0, 0x101A0}, {0x101D0, 0x101FC}, {0x10877, 0x10878}, {0x10AC8, 0x10AC8}, {0x1173F, 0x1173F}, {0x11FD5, 0x11FF1}, {0x16B3C, 0x16B3F},
{0x16B45, 0x16B45}, {0x1BC9C, 0x1BC9C}, {0x1D000, 0x1D0F5}, {0x1D100, 0x1D126}, {0x1D129, 0x1D164}, {0x1D16A, 0x1D16C}, {0x1D183, 0x1D184}, {0x1D18C, 0x1D1A9}, {0x1D1AE, 0x1D1E8}, {0x1D200, 0x1D241},
{0x1D245, 0x1D245}, {0x1D300, 0x1D356}, {0x1D6C1, 0x1D6C1}, {0x1D6DB, 0x1D6DB}, {0x1D6FB, 0x1D6FB}, {0x1D715, 0x1D715}, {0x1D735, 0x1D735}, {0x1D74F, 0x1D74F}, {0x1D76F, 0x1D76F}, {0x1D789, 0x1D789},
{0x1D7A9, 0x1D7A9}, {0x1D7C3, 0x1D7C3}, {0x1D800, 0x1D9FF}, {0x1DA37, 0x1DA3A}, {0x1DA6D, 0x1DA74}, {0x1DA76, 0x1DA83}, {0x1DA85, 0x1DA86}, {0x1E14F, 0x1E14F}, {0x1E2FF, 0x1E2FF}, {0x1ECAC, 0x1ECAC},
{0x1ECB0, 0x1ECB0}, {0x1ED2E, 0x1ED2E}, {0x1EEF0, 0x1EEF1}, {0x1F000, 0x1F02B}, {0x1F030, 0x1F093}, {0x1F0A0, 0x1F0AE}, {0x1F0B1, 0x1F0BF}, {0x1F0C1, 0x1F0CF}, {0x1F0D1, 0x1F0F5}, {0x1F10D, 0x1F1AD},
{0x1F1E6, 0x1F202}, {0x1F210, 0x1F23B}, {0x1F240, 0x1F248}, {0x1F250, 0x1F251}, {0x1F260, 0x1F265}, {0x1F300, 0x1F6D7}, {0x1F6E0, 0x1F6EC}, {0x1F6F0, 0x1F6FC}, {0x1F700, 0x1F773}, {0x1F780, 0x1F7D8},
{0x1F7E0, 0x1F7EB}, {0x1F800, 0x1F80B}, {0x1F810, 0x1F847}, {0x1F850, 0x1F859}, {0x1F860, 0x1F887}, {0x1F890, 0x1F8AD}, {0x1F8B0, 0x1F8B1}, {0x1F900, 0x1F978}, {0x1F97A, 0x1F9CB}, {0x1F9CD, 0x1FA53},
{0x1FA60, 0x1FA6D}, {0x1FA70, 0x1FA74}, {0x1FA78, 0x1FA7A}, {0x1FA80, 0x1FA86}, {0x1FA90, 0x1FAA8}, {0x1FAB0, 0x1FAB6}, {0x1FAC0, 0x1FAC2}, {0x1FAD0, 0x1FAD6}, {0x1FB00, 0x1FB92}, {0x1FB94, 0x1FBCA},
};

static const std::vector<std::pair<uint32_t, uint32_t>> control_ranges = {
{0x0, 0x8}, {0xE, 0x1B}, {0x7F, 0x84}, {0x86, 0x9F}, {0xAD, 0xAD}, {0x378, 0x379}, {0x380, 0x383}, {0x38B, 0x38B}, {0x38D, 0x38D}, {0x3A2, 0x3A2}, {0x530, 0x530}, {0x557, 0x558}, {0x58B, 0x58C},
{0x590, 0x590}, {0x5C8, 0x5CF}, {0x5EB, 0x5EE}, {0x5F5, 0x605}, {0x61C, 0x61D}, {0x6DD, 0x6DD}, {0x70E, 0x70F}, {0x74B, 0x74C}, {0x7B2, 0x7BF}, {0x7FB, 0x7FC}, {0x82E, 0x82F}, {0x83F, 0x83F},
{0x85C, 0x85D}, {0x85F, 0x85F}, {0x86B, 0x89F}, {0x8B5, 0x8B5}, {0x8C8, 0x8D2}, {0x8E2, 0x8E2}, {0x984, 0x984}, {0x98D, 0x98E}, {0x991, 0x992}, {0x9A9, 0x9A9}, {0x9B1, 0x9B1}, {0x9B3, 0x9B5},
{0x9BA, 0x9BB}, {0x9C5, 0x9C6}, {0x9C9, 0x9CA}, {0x9CF, 0x9D6}, {0x9D8, 0x9DB}, {0x9DE, 0x9DE}, {0x9E4, 0x9E5}, {0x9FF, 0xA00}, {0xA04, 0xA04}, {0xA0B, 0xA0E}, {0xA11, 0xA12}, {0xA29, 0xA29},
{0xA31, 0xA31}, {0xA34, 0xA34}, {0xA37, 0xA37}, {0xA3A, 0xA3B}, {0xA3D, 0xA3D}, {0xA43, 0xA46}, {0xA49, 0xA4A}, {0xA4E, 0xA50}, {0xA52, 0xA58}, {0xA5D, 0xA5D}, {0xA5F, 0xA65}, {0xA77, 0xA80},
{0xA84, 0xA84}, {0xA8E, 0xA8E}, {0xA92, 0xA92}, {0xAA9, 0xAA9}, {0xAB1, 0xAB1}, {0xAB4, 0xAB4}, {0xABA, 0xABB}, {0xAC6, 0xAC6}, {0xACA, 0xACA}, {0xACE, 0xACF}, {0xAD1, 0xADF}, {0xAE4, 0xAE5},
{0xAF2, 0xAF8}, {0xB00, 0xB00}, {0xB04, 0xB04}, {0xB0D, 0xB0E}, {0xB11, 0xB12}, {0xB29, 0xB29}, {0xB31, 0xB31}, {0xB34, 0xB34}, {0xB3A, 0xB3B}, {0xB45, 0xB46}, {0xB49, 0xB4A}, {0xB4E, 0xB54},
{0xB58, 0xB5B}, {0xB5E, 0xB5E}, {0xB64, 0xB65}, {0xB78, 0xB81}, {0xB84, 0xB84}, {0xB8B, 0xB8D}, {0xB91, 0xB91}, {0xB96, 0xB98}, {0xB9B, 0xB9B}, {0xB9D, 0xB9D}, {0xBA0, 0xBA2}, {0xBA5, 0xBA7},
{0xBAB, 0xBAD}, {0xBBA, 0xBBD}, {0xBC3, 0xBC5}, {0xBC9, 0xBC9}, {0xBCE, 0xBCF}, {0xBD1, 0xBD6}, {0xBD8, 0xBE5}, {0xBFB, 0xBFF}, {0xC0D, 0xC0D}, {0xC11, 0xC11}, {0xC29, 0xC29}, {0xC3A, 0xC3C},
{0xC45, 0xC45}, {0xC49, 0xC49}, {0xC4E, 0xC54}, {0xC57, 0xC57}, {0xC5B, 0xC5F}, {0xC64, 0xC65}, {0xC70, 0xC76}, {0xC8D, 0xC8D}, {0xC91, 0xC91}, {0xCA9, 0xCA9}, {0xCB4, 0xCB4}, {0xCBA, 0xCBB},
{0xCC5, 0xCC5}, {0xCC9, 0xCC9}, {0xCCE, 0xCD4}, {0xCD7, 0xCDD}, {0xCDF, 0xCDF}, {0xCE4, 0xCE5}, {0xCF0, 0xCF0}, {0xCF3, 0xCFF}, {0xD0D, 0xD0D}, {0xD11, 0xD11}, {0xD45, 0xD45}, {0xD49, 0xD49},
{0xD50, 0xD53}, {0xD64, 0xD65}, {0xD80, 0xD80}, {0xD84, 0xD84}, {0xD97, 0xD99}, {0xDB2, 0xDB2}, {0xDBC, 0xDBC}, {0xDBE, 0xDBF}, {0xDC7, 0xDC9}, {0xDCB, 0xDCE}, {0xDD5, 0xDD5}, {0xDD7, 0xDD7},
{0xDE0, 0xDE5}, {0xDF0, 0xDF1}, {0xDF5, 0xE00}, {0xE3B, 0xE3E}, {0xE5C, 0xE80}, {0xE83, 0xE83}, {0xE85, 0xE85}, {0xE8B, 0xE8B}, {0xEA4, 0xEA4}, {0xEA6, 0xEA6}, {0xEBE, 0xEBF}, {0xEC5, 0xEC5},
{0xEC7, 0xEC7}, {0xECE, 0xECF}, {0xEDA, 0xEDB}, {0xEE0, 0xEFF}, {0xF48, 0xF48}, {0xF6D, 0xF70}, {0xF98, 0xF98}, {0xFBD, 0xFBD}, {0xFCD, 0xFCD}, {0xFDB, 0xFFF}, {0x10C6, 0x10C6}, {0x10C8, 0x10CC},
{0x10CE, 0x10CF}, {0x1249, 0x1249}, {0x124E, 0x124F}, {0x1257, 0x1257}, {0x1259, 0x1259}, {0x125E, 0x125F}, {0x1289, 0x1289}, {0x128E, 0x128F}, {0x12B1, 0x12B1}, {0x12B6, 0x12B7}, {0x12BF, 0x12BF},
{0x12C1, 0x12C1}, {0x12C6, 0x12C7}, {0x12D7, 0x12D7}, {0x1311, 0x1311}, {0x1316, 0x1317}, {0x135B, 0x135C}, {0x137D, 0x137F}, {0x139A, 0x139F}, {0x13F6, 0x13F7}, {0x13FE, 0x13FF}, {0x169D, 0x169F},
{0x16F9, 0x16FF}, {0x170D, 0x170D}, {0x1715, 0x171F}, {0x1737, 0x173F}, {0x1754, 0x175F}, {0x176D, 0x176D}, {0x1771, 0x1771}, {0x1774, 0x177F}, {0x17DE, 0x17DF}, {0x17EA, 0x17EF}, {0x17FA, 0x17FF},
{0x180E, 0x180F}, {0x181A, 0x181F}, {0x1879, 0x187F}, {0x18AB, 0x18AF}, {0x18F6, 0x18FF}, {0x191F, 0x191F}, {0x192C, 0x192F}, {0x193C, 0x193F}, {0x1941, 0x1943}, {0x196E, 0x196F}, {0x1975, 0x197F},
{0x19AC, 0x19AF}, {0x19CA, 0x19CF}, {0x19DB, 0x19DD}, {0x1A1C, 0x1A1D}, {0x1A5F, 0x1A5F}, {0x1A7D, 0x1A7E}, {0x1A8A, 0x1A8F}, {0x1A9A, 0x1A9F}, {0x1AAE, 0x1AAF}, {0x1AC1, 0x1AFF}, {0x1B4C, 0x1B4F},
{0x1B7D, 0x1B7F}, {0x1BF4, 0x1BFB}, {0x1C38, 0x1C3A}, {0x1C4A, 0x1C4C}, {0x1C89, 0x1C8F}, {0x1CBB, 0x1CBC}, {0x1CC8, 0x1CCF}, {0x1CFB, 0x1CFF}, {0x1DFA, 0x1DFA}, {0x1F16, 0x1F17}, {0x1F1E, 0x1F1F},
{0x1F46, 0x1F47}, {0x1F4E, 0x1F4F}, {0x1F58, 0x1F58}, {0x1F5A, 0x1F5A}, {0x1F5C, 0x1F5C}, {0x1F5E, 0x1F5E}, {0x1F7E, 0x1F7F}, {0x1FB5, 0x1FB5}, {0x1FC5, 0x1FC5}, {0x1FD4, 0x1FD5}, {0x1FDC, 0x1FDC},
{0x1FF0, 0x1FF1}, {0x1FF5, 0x1FF5}, {0x1FFF, 0x1FFF}, {0x200B, 0x200F}, {0x202A, 0x202E}, {0x2060, 0x206F}, {0x2072, 0x2073}, {0x208F, 0x208F}, {0x209D, 0x209F}, {0x20C0, 0x20CF}, {0x20F1, 0x20FF},
{0x218C, 0x218F}, {0x2427, 0x243F}, {0x244B, 0x245F}, {0x2B74, 0x2B75}, {0x2B96, 0x2B96}, {0x2C2F, 0x2C2F}, {0x2C5F, 0x2C5F}, {0x2CF4, 0x2CF8}, {0x2D26, 0x2D26}, {0x2D28, 0x2D2C}, {0x2D2E, 0x2D2F},
{0x2D68, 0x2D6E}, {0x2D71, 0x2D7E}, {0x2D97, 0x2D9F}, {0x2DA7, 0x2DA7}, {0x2DAF, 0x2DAF}, {0x2DB7, 0x2DB7}, {0x2DBF, 0x2DBF}, {0x2DC7, 0x2DC7}, {0x2DCF, 0x2DCF}, {0x2DD7, 0x2DD7}, {0x2DDF, 0x2DDF},
{0x2E53, 0x2E7F}, {0x2E9A, 0x2E9A}, {0x2EF4, 0x2EFF}, {0x2FD6, 0x2FEF}, {0x2FFC, 0x2FFF}, {0x3040, 0x3040}, {0x3097, 0x3098}, {0x3100, 0x3104}, {0x3130, 0x3130}, {0x318F, 0x318F}, {0x31E4, 0x31EF},
{0x321F, 0x321F}, {0x9FFD, 0x9FFF}, {0xA48D, 0xA48F}, {0xA4C7, 0xA4CF}, {0xA62C, 0xA63F}, {0xA6F8, 0xA6FF}, {0xA7C0, 0xA7C1}, {0xA7CB, 0xA7F4}, {0xA82D, 0xA82F}, {0xA83A, 0xA83F}, {0xA878, 0xA87F},
{0xA8C6, 0xA8CD}, {0xA8DA, 0xA8DF}, {0xA954, 0xA95E}, {0xA97D, 0xA97F}, {0xA9CE, 0xA9CE}, {0xA9DA, 0xA9DD}, {0xA9FF, 0xA9FF}, {0xAA37, 0xAA3F}, {0xAA4E, 0xAA4F}, {0xAA5A, 0xAA5B}, {0xAAC3, 0xAADA},
{0xAAF7, 0xAB00}, {0xAB07, 0xAB08}, {0xAB0F, 0xAB10}, {0xAB17, 0xAB1F}, {0xAB27, 0xAB27}, {0xAB2F, 0xAB2F}, {0xAB6C, 0xAB6F}, {0xABEE, 0xABEF}, {0xABFA, 0xABFF}, {0xD7A4, 0xD7AF}, {0xD7C7, 0xD7CA},
{0xD7FC, 0xF8FF}, {0xFA6E, 0xFA6F}, {0xFADA, 0xFAFF}, {0xFB07, 0xFB12}, {0xFB18, 0xFB1C}, {0xFB37, 0xFB37}, {0xFB3D, 0xFB3D}, {0xFB3F, 0xFB3F}, {0xFB42, 0xFB42}, {0xFB45, 0xFB45}, {0xFBC2, 0xFBD2},
{0xFD40, 0xFD4F}, {0xFD90, 0xFD91}, {0xFDC8, 0xFDEF}, {0xFDFE, 0xFDFF}, {0xFE1A, 0xFE1F}, {0xFE53, 0xFE53}, {0xFE67, 0xFE67}, {0xFE6C, 0xFE6F}, {0xFE75, 0xFE75}, {0xFEFD, 0xFF00}, {0xFFBF, 0xFFC1},
{0xFFC8, 0xFFC9}, {0xFFD0, 0xFFD1}, {0xFFD8, 0xFFD9}, {0xFFDD, 0xFFDF}, {0xFFE7, 0xFFE7}, {0xFFEF, 0xFFFB}, {0xFFFE, 0xFFFF}, {0x1000C, 0x1000C}, {0x10027, 0x10027}, {0x1003B, 0x1003B},
{0x1003E, 0x1003E}, {0x1004E, 0x1004F}, {0x1005E, 0x1007F}, {0x100FB, 0x100FF}, {0x10103, 0x10106}, {0x10134, 0x10136}, {0x1018F, 0x1018F}, {0x1019D, 0x1019F}, {0x101A1, 0x101CF}, {0x101FE, 0x1027F},
{0x1029D, 0x1029F}, {0x102D1, 0x102DF}, {0x102FC, 0x102FF}, {0x10324, 0x1032C}, {0x1034B, 0x1034F}, {0x1037B, 0x1037F}, {0x1039E, 0x1039E}, {0x103C4, 0x103C7}, {0x103D6, 0x103FF}, {0x1049E, 0x1049F},
{0x104AA, 0x104AF}, {0x104D4, 0x104D7}, {0x104FC, 0x104FF}, {0x10528, 0x1052F}, {0x10564, 0x1056E}, {0x10570, 0x105FF}, {0x10737, 0x1073F}, {0x10756, 0x1075F}, {0x10768, 0x107FF}, {0x10806, 0x10807},
{0x10809, 0x10809}, {0x10836, 0x10836}, {0x10839, 0x1083B}, {0x1083D, 0x1083E}, {0x10856, 0x10856}, {0x1089F, 0x108A6}, {0x108B0, 0x108DF}, {0x108F3, 0x108F3}, {0x108F6, 0x108FA}, {0x1091C, 0x1091E},
{0x1093A, 0x1093E}, {0x10940, 0x1097F}, {0x109B8, 0x109BB}, {0x109D0, 0x109D1}, {0x10A04, 0x10A04}, {0x10A07, 0x10A0B}, {0x10A14, 0x10A14}, {0x10A18, 0x10A18}, {0x10A36, 0x10A37}, {0x10A3B, 0x10A3E},
{0x10A49, 0x10A4F}, {0x10A59, 0x10A5F}, {0x10AA0, 0x10ABF}, {0x10AE7, 0x10AEA}, {0x10AF7, 0x10AFF}, {0x10B36, 0x10B38}, {0x10B56, 0x10B57}, {0x10B73, 0x10B77}, {0x10B92, 0x10B98}, {0x10B9D, 0x10BA8},
{0x10BB0, 0x10BFF}, {0x10C49, 0x10C7F}, {0x10CB3, 0x10CBF}, {0x10CF3, 0x10CF9}, {0x10D28, 0x10D2F}, {0x10D3A, 0x10E5F}, {0x10E7F, 0x10E7F}, {0x10EAA, 0x10EAA}, {0x10EAE, 0x10EAF}, {0x10EB2, 0x10EFF},
{0x10F28, 0x10F2F}, {0x10F5A, 0x10FAF}, {0x10FCC, 0x10FDF}, {0x10FF7, 0x10FFF}, {0x1104E, 0x11051}, {0x11070, 0x1107E}, {0x110BD, 0x110BD}, {0x110C2, 0x110CF}, {0x110E9, 0x110EF}, {0x110FA, 0x110FF},
{0x11135, 0x11135}, {0x11148, 0x1114F}, {0x11177, 0x1117F}, {0x111E0, 0x111E0}, {0x111F5, 0x111FF}, {0x11212, 0x11212}, {0x1123F, 0x1127F}, {0x11287, 0x11287}, {0x11289, 0x11289}, {0x1128E, 0x1128E},
{0x1129E, 0x1129E}, {0x112AA, 0x112AF}, {0x112EB, 0x112EF}, {0x112FA, 0x112FF}, {0x11304, 0x11304}, {0x1130D, 0x1130E}, {0x11311, 0x11312}, {0x11329, 0x11329}, {0x11331, 0x11331}, {0x11334, 0x11334},
{0x1133A, 0x1133A}, {0x11345, 0x11346}, {0x11349, 0x1134A}, {0x1134E, 0x1134F}, {0x11351, 0x11356}, {0x11358, 0x1135C}, {0x11364, 0x11365}, {0x1136D, 0x1136F}, {0x11375, 0x113FF}, {0x1145C, 0x1145C},
{0x11462, 0x1147F}, {0x114C8, 0x114CF}, {0x114DA, 0x1157F}, {0x115B6, 0x115B7}, {0x115DE, 0x115FF}, {0x11645, 0x1164F}, {0x1165A, 0x1165F}, {0x1166D, 0x1167F}, {0x116B9, 0x116BF}, {0x116CA, 0x116FF},
{0x1171B, 0x1171C}, {0x1172C, 0x1172F}, {0x11740, 0x117FF}, {0x1183C, 0x1189F}, {0x118F3, 0x118FE}, {0x11907, 0x11908}, {0x1190A, 0x1190B}, {0x11914, 0x11914}, {0x11917, 0x11917}, {0x11936, 0x11936},
{0x11939, 0x1193A}, {0x11947, 0x1194F}, {0x1195A, 0x1199F}, {0x119A8, 0x119A9}, {0x119D8, 0x119D9}, {0x119E5, 0x119FF}, {0x11A48, 0x11A4F}, {0x11AA3, 0x11ABF}, {0x11AF9, 0x11BFF}, {0x11C09, 0x11C09},
{0x11C37, 0x11C37}, {0x11C46, 0x11C4F}, {0x11C6D, 0x11C6F}, {0x11C90, 0x11C91}, {0x11CA8, 0x11CA8}, {0x11CB7, 0x11CFF}, {0x11D07, 0x11D07}, {0x11D0A, 0x11D0A}, {0x11D37, 0x11D39}, {0x11D3B, 0x11D3B},
{0x11D3E, 0x11D3E}, {0x11D48, 0x11D4F}, {0x11D5A, 0x11D5F}, {0x11D66, 0x11D66}, {0x11D69, 0x11D69}, {0x11D8F, 0x11D8F}, {0x11D92, 0x11D92}, {0x11D99, 0x11D9F}, {0x11DAA, 0x11EDF}, {0x11EF9, 0x11FAF},
{0x11FB1, 0x11FBF}, {0x11FF2, 0x11FFE}, {0x1239A, 0x123FF}, {0x1246F, 0x1246F}, {0x12475, 0x1247F}, {0x12544, 0x12FFF}, {0x1342F, 0x143FF}, {0x14647, 0x167FF}, {0x16A39, 0x16A3F}, {0x16A5F, 0x16A5F},
{0x16A6A, 0x16A6D}, {0x16A70, 0x16ACF}, {0x16AEE, 0x16AEF}, {0x16AF6, 0x16AFF}, {0x16B46, 0x16B4F}, {0x16B5A, 0x16B5A}, {0x16B62, 0x16B62}, {0x16B78, 0x16B7C}, {0x16B90, 0x16E3F}, {0x16E9B, 0x16EFF},
{0x16F4B, 0x16F4E}, {0x16F88, 0x16F8E}, {0x16FA0, 0x16FDF}, {0x16FE5, 0x16FEF}, {0x16FF2, 0x16FFF}, {0x187F8, 0x187FF}, {0x18CD6, 0x18CFF}, {0x18D09, 0x1AFFF}, {0x1B11F, 0x1B14F}, {0x1B153, 0x1B163},
{0x1B168, 0x1B16F}, {0x1B2FC, 0x1BBFF}, {0x1BC6B, 0x1BC6F}, {0x1BC7D, 0x1BC7F}, {0x1BC89, 0x1BC8F}, {0x1BC9A, 0x1BC9B}, {0x1BCA0, 0x1CFFF}, {0x1D0F6, 0x1D0FF}, {0x1D127, 0x1D128}, {0x1D173, 0x1D17A},
{0x1D1E9, 0x1D1FF}, {0x1D246, 0x1D2DF}, {0x1D2F4, 0x1D2FF}, {0x1D357, 0x1D35F}, {0x1D379, 0x1D3FF}, {0x1D455, 0x1D455}, {0x1D49D, 0x1D49D}, {0x1D4A0, 0x1D4A1}, {0x1D4A3, 0x1D4A4}, {0x1D4A7, 0x1D4A8},
{0x1D4AD, 0x1D4AD}, {0x1D4BA, 0x1D4BA}, {0x1D4BC, 0x1D4BC}, {0x1D4C4, 0x1D4C4}, {0x1D506, 0x1D506}, {0x1D50B, 0x1D50C}, {0x1D515, 0x1D515}, {0x1D51D, 0x1D51D}, {0x1D53A, 0x1D53A}, {0x1D53F, 0x1D53F},
{0x1D545, 0x1D545}, {0x1D547, 0x1D549}, {0x1D551, 0x1D551}, {0x1D6A6, 0x1D6A7}, {0x1D7CC, 0x1D7CD}, {0x1DA8C, 0x1DA9A}, {0x1DAA0, 0x1DAA0}, {0x1DAB0, 0x1DFFF}, {0x1E007, 0x1E007}, {0x1E019, 0x1E01A},
{0x1E022, 0x1E022}, {0x1E025, 0x1E025}, {0x1E02B, 0x1E0FF}, {0x1E12D, 0x1E12F}, {0x1E13E, 0x1E13F}, {0x1E14A, 0x1E14D}, {0x1E150, 0x1E2BF}, {0x1E2FA, 0x1E2FE}, {0x1E300, 0x1E7FF}, {0x1E8C5, 0x1E8C6},
{0x1E8D7, 0x1E8FF}, {0x1E94C, 0x1E94F}, {0x1E95A, 0x1E95D}, {0x1E960, 0x1EC70}, {0x1ECB5, 0x1ED00}, {0x1ED3E, 0x1EDFF}, {0x1EE04, 0x1EE04}, {0x1EE20, 0x1EE20}, {0x1EE23, 0x1EE23}, {0x1EE25, 0x1EE26},
{0x1EE28, 0x1EE28}, {0x1EE33, 0x1EE33}, {0x1EE38, 0x1EE38}, {0x1EE3A, 0x1EE3A}, {0x1EE3C, 0x1EE41}, {0x1EE43, 0x1EE46}, {0x1EE48, 0x1EE48}, {0x1EE4A, 0x1EE4A}, {0x1EE4C, 0x1EE4C}, {0x1EE50, 0x1EE50},
{0x1EE53, 0x1EE53}, {0x1EE55, 0x1EE56}, {0x1EE58, 0x1EE58}, {0x1EE5A, 0x1EE5A}, {0x1EE5C, 0x1EE5C}, {0x1EE5E, 0x1EE5E}, {0x1EE60, 0x1EE60}, {0x1EE63, 0x1EE63}, {0x1EE65, 0x1EE66}, {0x1EE6B, 0x1EE6B},
{0x1EE73, 0x1EE73}, {0x1EE78, 0x1EE78}, {0x1EE7D, 0x1EE7D}, {0x1EE7F, 0x1EE7F}, {0x1EE8A, 0x1EE8A}, {0x1EE9C, 0x1EEA0}, {0x1EEA4, 0x1EEA4}, {0x1EEAA, 0x1EEAA}, {0x1EEBC, 0x1EEEF}, {0x1EEF2, 0x1EFFF},
{0x1F02C, 0x1F02F}, {0x1F094, 0x1F09F}, {0x1F0AF, 0x1F0B0}, {0x1F0C0, 0x1F0C0}, {0x1F0D0, 0x1F0D0}, {0x1F0F6, 0x1F0FF}, {0x1F1AE, 0x1F1E5}, {0x1F203, 0x1F20F}, {0x1F23C, 0x1F23F}, {0x1F249, 0x1F24F},
{0x1F252, 0x1F25F}, {0x1F266, 0x1F2FF}, {0x1F6D8, 0x1F6DF}, {0x1F6ED, 0x1F6EF}, {0x1F6FD, 0x1F6FF}, {0x1F774, 0x1F77F}, {0x1F7D9, 0x1F7DF}, {0x1F7EC, 0x1F7FF}, {0x1F80C, 0x1F80F}, {0x1F848, 0x1F84F},
{0x1F85A, 0x1F85F}, {0x1F888, 0x1F88F}, {0x1F8AE, 0x1F8AF}, {0x1F8B2, 0x1F8FF}, {0x1F979, 0x1F979}, {0x1F9CC, 0x1F9CC}, {0x1FA54, 0x1FA5F}, {0x1FA6E, 0x1FA6F}, {0x1FA75, 0x1FA77}, {0x1FA7B, 0x1FA7F},
{0x1FA87, 0x1FA8F}, {0x1FAA9, 0x1FAAF}, {0x1FAB7, 0x1FABF}, {0x1FAC3, 0x1FACF}, {0x1FAD7, 0x1FAFF}, {0x1FB93, 0x1FB93}, {0x1FBCB, 0x1FBEF}, {0x1FBFA, 0x1FFFF}, {0x2A6DE, 0x2A6FF}, {0x2B735, 0x2B73F},
{0x2B81E, 0x2B81F}, {0x2CEA2, 0x2CEAF}, {0x2EBE1, 0x2F7FF}, {0x2FA1E, 0x2FFFF}, {0x3134B, 0xE00FF}, {0xE01F0, 0x10FFFF},
};
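
// Note: the range tables above (digit/letter/whitespace/accent-mark/punctuation/symbol/control)
// appear to be a flattened snapshot of the Unicode general categories; they feed
// codepoint_type_map() further below, and any code point not covered by a table falls back to
// CODEPOINT_TYPE_UNIDENTIFIED.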
static std::string codepoint_to_utf8(uint32_t cp) {
    std::string result;
    if (/* 0x00 <= cp && */ cp <= 0x7f) {
        result.push_back(cp);
    }
    else if (0x80 <= cp && cp <= 0x7ff) {
        result.push_back(0xc0 | ((cp >> 6) & 0x1f));
        result.push_back(0x80 | (cp & 0x3f));
    }
    else if (0x800 <= cp && cp <= 0xffff) {
        result.push_back(0xe0 | ((cp >> 12) & 0x0f));
        result.push_back(0x80 | ((cp >> 6) & 0x3f));
        result.push_back(0x80 | (cp & 0x3f));
    }
    else if (0x10000 <= cp && cp <= 0x10ffff) {
        result.push_back(0xf0 | ((cp >> 18) & 0x07));
        result.push_back(0x80 | ((cp >> 12) & 0x3f));
        result.push_back(0x80 | ((cp >> 6) & 0x3f));
        result.push_back(0x80 | (cp & 0x3f));
    }
    else {
        throw std::invalid_argument("invalid codepoint");
    }
    return result;
}
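
// Illustrative note: codepoint_to_utf8() follows the standard UTF-8 layout -- 1 byte for
// U+0000..U+007F, 2 bytes (0xC0.., 0x80..) for U+0080..U+07FF, 3 bytes for U+0800..U+FFFF and
// 4 bytes for U+10000..U+10FFFF. For example, U+00E9 ("é") encodes as 0xC3 0xA9 and
// U+20AC ("€") as 0xE2 0x82 0xAC.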
static std::string codepoints_to_utf8(const std::vector<uint32_t> & cps) {
    std::string result;
    for (size_t i = 0; i < cps.size(); ++i) {
        result.append(codepoint_to_utf8(cps[i]));
    }
    return result;
}
static uint32_t codepoint_from_utf8(const std::string & utf8, size_t & offset) {
    assert(offset < utf8.size());
    if (!(utf8[offset + 0] & 0x80)) {
        auto result = utf8[offset + 0];
        offset += 1;
        return result;
    }
    else if (!(utf8[offset + 0] & 0x40)) {
        throw std::invalid_argument("invalid character");
    }
    else if (!(utf8[offset + 0] & 0x20)) {
        if (offset + 1 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80))
            throw std::invalid_argument("invalid character");
        auto result = ((utf8[offset + 0] & 0x1f) << 6) | (utf8[offset + 1] & 0x3f);
        offset += 2;
        return result;
    }
    else if (!(utf8[offset + 0] & 0x10)) {
        if (offset + 2 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80))
            throw std::invalid_argument("invalid character");
        auto result = ((utf8[offset + 0] & 0x0f) << 12) | ((utf8[offset + 1] & 0x3f) << 6) | (utf8[offset + 2] & 0x3f);
        offset += 3;
        return result;
    }
    else if (!(utf8[offset + 0] & 0x08)) {
        if (offset + 3 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80) || !((utf8[offset + 3] & 0xc0) == 0x80))
            throw std::invalid_argument("invalid character");
        auto result = ((utf8[offset + 0] & 0x07) << 18) | ((utf8[offset + 1] & 0x3f) << 12) | ((utf8[offset + 2] & 0x3f) << 6) | (utf8[offset + 3] & 0x3f);
        offset += 4;
        return result;
    }
    throw std::invalid_argument("invalid string");
}
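
// Decoding note: codepoint_from_utf8() advances `offset` past the code point it returns and
// validates that continuation bytes match the 10xxxxxx pattern, throwing std::invalid_argument
// otherwise. It does not appear to reject overlong encodings or UTF-16 surrogate values
// (e.g. 0xED 0xA0 0x80 decodes to U+D800), so it should not be treated as a strict validator.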
static std::vector<uint32_t> codepoints_from_utf8(const std::string & utf8) {
    std::vector<uint32_t> result;
    size_t offset = 0;
    while (offset < utf8.size()) {
        result.push_back(codepoint_from_utf8(utf8, offset));
    }
    return result;
}
static std::vector<uint16_t> codepoint_to_utf16(uint32_t cp) {
    std::vector<uint16_t> result;
    if (/* 0x0000 <= cp && */ cp <= 0xffff) {
        result.emplace_back(cp);
    }
    else if (0x10000 <= cp && cp <= 0x10ffff) {
        result.emplace_back(0xd800 | ((cp - 0x10000) >> 10));
        result.emplace_back(0xdc00 | ((cp - 0x10000) & 0x03ff));
    }
    else {
        throw std::invalid_argument("invalid codepoint");
    }
    return result;
}
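
// Surrogate-pair sketch: for code points above U+FFFF, codepoint_to_utf16() subtracts 0x10000 and
// splits the remaining 20 bits into a high surrogate (0xD800 | top 10 bits) and a low surrogate
// (0xDC00 | bottom 10 bits). For example, U+1F600 becomes the pair 0xD83D 0xDE00. BMP code
// points, including unpaired surrogate values, pass through as a single code unit.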
static std::vector<uint16_t> codepoints_to_utf16(const std::vector<uint32_t> & cps) {
    std::vector<uint16_t> result;
    for (size_t i = 0; i < cps.size(); ++i) {
        auto temp = codepoint_to_utf16(cps[i]);
        result.insert(result.end(), temp.begin(), temp.end());
    }
    return result;
}
static uint32_t codepoint_from_utf16(const std::vector<uint16_t> & utf16, size_t & offset) {
    assert(offset < utf16.size());
    // index relative to offset so that code units after the first are decoded correctly
    if (((utf16[offset + 0] >> 10) << 10) != 0xd800) {
        auto result = utf16[offset + 0];
        offset += 1;
        return result;
    }
    else {
        if (offset + 1 >= utf16.size() || !((utf16[offset + 1] & 0xdc00) == 0xdc00))
            throw std::invalid_argument("invalid character");
        auto result = 0x10000 + (((utf16[offset + 0] & 0x03ff) << 10) | (utf16[offset + 1] & 0x03ff));
        offset += 2;
        return result;
    }
    throw std::invalid_argument("invalid string");
}
static std::vector<uint32_t> codepoints_from_utf16(const std::vector<uint16_t> & utf16) {
    std::vector<uint32_t> result;
    size_t offset = 0;
    while (offset < utf16.size()) {
        result.push_back(codepoint_from_utf16(utf16, offset));
    }
    return result;
}
#define CODEPOINT_TYPE_UNIDENTIFIED 0
#define CODEPOINT_TYPE_DIGIT        1
#define CODEPOINT_TYPE_LETTER       2
#define CODEPOINT_TYPE_WHITESPACE   3
#define CODEPOINT_TYPE_ACCENT_MARK  4
#define CODEPOINT_TYPE_PUNCTUATION  5
#define CODEPOINT_TYPE_SYMBOL       6
#define CODEPOINT_TYPE_CONTROL      7
static std::unordered_map<uint32_t, int> codepoint_type_map() {
    std::unordered_map<uint32_t, int> codepoint_types;
    for (auto p : digit_ranges) {
        for (auto i = p.first; i <= p.second; ++i)
            codepoint_types[i] = CODEPOINT_TYPE_DIGIT;
    }
    for (auto p : letter_ranges) {
        for (auto i = p.first; i <= p.second; ++i)
            codepoint_types[i] = CODEPOINT_TYPE_LETTER;
    }
    for (auto p : whitespace_ranges) {
        for (auto i = p.first; i <= p.second; ++i)
            codepoint_types[i] = CODEPOINT_TYPE_WHITESPACE;
    }
    for (auto p : accent_mark_ranges) {
        for (auto i = p.first; i <= p.second; ++i)
            codepoint_types[i] = CODEPOINT_TYPE_ACCENT_MARK;
    }
    for (auto p : punctuation_ranges) {
        for (auto i = p.first; i <= p.second; ++i)
            codepoint_types[i] = CODEPOINT_TYPE_PUNCTUATION;
    }
    for (auto p : symbol_ranges) {
        for (auto i = p.first; i <= p.second; ++i)
            codepoint_types[i] = CODEPOINT_TYPE_SYMBOL;
    }
    for (auto p : control_ranges) {
        for (auto i = p.first; i <= p.second; ++i)
            codepoint_types[i] = CODEPOINT_TYPE_CONTROL;
    }
    return codepoint_types;
}
static int codepoint_type(uint32_t cp) {
    static std::unordered_map<uint32_t, int> codepoint_types = codepoint_type_map();
    return codepoint_types[cp];
}
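
// Lookup note: the type map is materialised once on first use (function-local static) and covers
// on the order of a million entries, one per code point reached by the ranges above. For code
// points not present in the map, operator[] default-inserts 0, i.e. CODEPOINT_TYPE_UNIDENTIFIED.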
static int codepoint_type(const std::string & utf8) {
    if (utf8.length() == 0)
        return CODEPOINT_TYPE_UNIDENTIFIED;
    size_t offset = 0;
    return codepoint_type(codepoint_from_utf8(utf8, offset));
}
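
// The helpers below appear to implement the byte-to-unicode scheme used by GPT-2 style BPE
// tokenizers: the 188 "printable" byte values ('!'..'~', '¡'..'¬', '®'..'ÿ') map to themselves,
// and the remaining 68 byte values are remapped to code points 256, 257, ... so that every byte
// has a visible, reversible unicode representation.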
static std::unordered_map<uint8_t, std::string> bytes_to_unicode_map_bpe() {
    std::unordered_map<uint8_t, std::string> map;
    for (int ch = u'!'; ch <= u'~'; ++ch) {
        assert(0 <= ch && ch < 256);
        map[ch] = codepoint_to_utf8(ch);
    }
    for (int ch = u'¡'; ch <= u'¬'; ++ch) {
        assert(0 <= ch && ch < 256);
        map[ch] = codepoint_to_utf8(ch);
    }
    for (int ch = u'®'; ch <= u'ÿ'; ++ch) {
        assert(0 <= ch && ch < 256);
        map[ch] = codepoint_to_utf8(ch);
    }
    auto n = 0;
    for (int ch = 0; ch < 256; ++ch) {
        if (map.find(ch) == map.end()) {
            map[ch] = codepoint_to_utf8(256 + n);
            ++n;
        }
    }
    return map;
}
static std::string bytes_to_unicode_bpe(uint8_t byte) {
    static std::unordered_map<uint8_t, std::string> map = bytes_to_unicode_map_bpe();
    return map.at(byte);
}
static std::unordered_map<std::string, uint8_t> unicode_to_bytes_map_bpe() {
    std::unordered_map<std::string, uint8_t> map;
    for (int ch = u'!'; ch <= u'~'; ++ch) {
        assert(0 <= ch && ch < 256);
        map[codepoint_to_utf8(ch)] = ch;
    }
    for (int ch = u'¡'; ch <= u'¬'; ++ch) {
        assert(0 <= ch && ch < 256);
        map[codepoint_to_utf8(ch)] = ch;
    }
    for (int ch = u'®'; ch <= u'ÿ'; ++ch) {
        assert(0 <= ch && ch < 256);
        map[codepoint_to_utf8(ch)] = ch;
    }
    auto n = 0;
    for (int ch = 0; ch < 256; ++ch) {
        if (map.find(codepoint_to_utf8(ch)) == map.end()) {
            map[codepoint_to_utf8(256 + n)] = ch;
            ++n;
        }
    }
    return map;
}
static uint8_t unicode_to_bytes_bpe(const std::string & utf8) {
    static std::unordered_map<std::string, uint8_t> map = unicode_to_bytes_map_bpe();
    return map.at(utf8);
}
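
// Usage sketch (illustrative only, not part of the original file):
//
//   std::vector<uint32_t> cps = codepoints_from_utf8("hé!");  // {0x68, 0xE9, 0x21}
//   int t = codepoint_type(cps[2]);                           // CODEPOINT_TYPE_PUNCTUATION ('!')
//   std::string visible = bytes_to_unicode_bpe(' ');          // space -> printable placeholder
//   uint8_t byte = unicode_to_bytes_bpe(visible);             // round-trips back to 0x20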