Merge branch 'ggerganov-master'
Commit a2b46fbda6

60 changed files with 25905 additions and 23889 deletions
@@ -28,4 +28,5 @@ indent_size = 2
  indent_style = tab

  [examples/cvector-generator/*.txt]
+ trim_trailing_whitespace = unset
  insert_final_newline = unset
.github/labeler.yml (vendored, 1 change)

@@ -42,7 +42,6 @@ build:
  - cmake/**
  - CMakeLists.txt
  - CMakePresets.json
- - codecov.yml
  examples:
  - changed-files:
  - any-glob-to-any-file: examples/**
.gitignore (vendored, 113 changes)

@@ -1,90 +1,123 @@
- *.o
+ # Extensions

  *.a
- *.so
+ *.bat
+ *.bin
+ *.dll
+ *.dot
+ *.etag
+ *.exe
+ *.gcda
+ *.gcno
+ *.gcov
  *.gguf
  *.gguf.json
- *.bin
- *.exe
- *.dll
- *.log
- *.gcov
- *.gcno
- *.gcda
- *.dot
- *.bat
- *.tmp
- *.metallib
- *.etag
  *.lastModified
- .DS_Store
+ *.log
- .build/
+ *.metallib
+ *.o
+ *.so
+ *.tmp

+ # IDE / OS

  .cache/
  .ccls-cache/
  .direnv/
+ .DS_Store
  .envrc
+ .idea/
  .swiftpm
- .venv
- .clang-tidy
  .vs/
  .vscode/
- .idea/
+ nppBackup

- ggml-metal-embed.metal

- lcov-report/
+ # Coverage

  gcovr-report/
+ lcov-report/

+ # Build Artifacts

  tags
+ .build/
  build*
+ !build-info.cmake
+ !build-info.cpp.in
+ !build-info.sh
  !build.zig
- cmake-build-*
+ /libllama.so
+ /llama-*
  android-ndk-*
+ arm_neon.h
+ cmake-build-*
+ CMakeSettings.json
+ compile_commands.json
+ ggml-metal-embed.metal
+ llama-batched-swift
  out/
  tmp/

+ # CI

+ !.github/workflows/*.yml

+ # Models

  models/*
  models-mnt
+ !models/.editorconfig
+ !models/ggml-vocab-*.gguf*

- /Pipfile
+ # Zig
- /libllama.so
- /llama-*
- llama-batched-swift
- /common/build-info.cpp
- arm_neon.h
- compile_commands.json
- CMakeSettings.json

- __pycache__
- dist

  zig-out/
  zig-cache/

+ # Logs

  ppl-*.txt
  qnt-*.txt
  perf-*.txt

+ # Examples

  examples/jeopardy/results.txt
+ examples/server/*.css.hpp
  examples/server/*.html.hpp
  examples/server/*.js.hpp
  examples/server/*.mjs.hpp
- examples/server/*.css.hpp
+ !build_64.sh
+ !examples/*.bat
+ !examples/*/*.kts
+ !examples/*/*/*.kts
+ !examples/sycl/*.bat
+ !examples/sycl/*.sh

+ # Python

+ __pycache__
+ .venv
+ /Pipfile
+ dist
  poetry.lock
  poetry.toml
- nppBackup

  # Test binaries
- /tests/test-grammar-parser
+ /tests/test-backend-ops
- /tests/test-llama-grammar
  /tests/test-double-float
  /tests/test-grad0
+ /tests/test-grammar-parser
+ /tests/test-llama-grammar
  /tests/test-opt
  /tests/test-quantize-fns
  /tests/test-quantize-perf
+ /tests/test-rope
  /tests/test-sampling
  /tests/test-tokenizer-0
- /tests/test-tokenizer-1-spm
  /tests/test-tokenizer-1-bpe
- /tests/test-rope
+ /tests/test-tokenizer-1-spm
- /tests/test-backend-ops
+ # Scripts
+ !/scripts/install-oneapi.bat
@@ -102,7 +102,8 @@ option(LLAMA_LLAMAFILE "llama: use llamafile SGEMM"
  option(LLAMA_CUDA "llama: use CUDA" OFF)
  option(LLAMA_CUBLAS "llama: use CUDA (deprecated, use LLAMA_CUDA)" OFF)
  option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
- option(LLAMA_CUDA_FORCE_MMQ "llama: use mmq kernels instead of cuBLAS" OFF)
+ option(LLAMA_CUDA_FORCE_MMQ "llama: always use mmq kernels instead of cuBLAS" OFF)
+ option(LLAMA_CUDA_FORCE_CUBLAS "llama: always use cuBLAS instead of mmq kernels" OFF)
  set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
  set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
  option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some calculations" OFF)

@@ -144,9 +145,6 @@ option(LLAMA_BUILD_SERVER "llama: build server example"
  option(LLAMA_LASX "llama: enable lasx" ON)
  option(LLAMA_LSX "llama: enable lsx" ON)

- # add perf arguments
- option(LLAMA_PERF "llama: enable perf" OFF)

  # Required for relocatable CMake package
  include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)

@@ -419,13 +417,14 @@ if (LLAMA_CUDA)

  if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
  # 52 == lowest CUDA 12 standard
- # 60 == f16 CUDA intrinsics
+ # 60 == FP16 CUDA intrinsics
  # 61 == integer CUDA intrinsics
- # 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
+ # 70 == FP16 tensor cores
+ # 75 == int8 tensor cores
  if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
- set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
+ set(CMAKE_CUDA_ARCHITECTURES "60;61;70;75")
  else()
- set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
+ set(CMAKE_CUDA_ARCHITECTURES "52;61;70;75")
  #set(CMAKE_CUDA_ARCHITECTURES "OFF") # use this to compile much faster, but only F16 models work
  endif()
  endif()

@@ -450,6 +449,9 @@ if (LLAMA_CUDA)
  if (LLAMA_CUDA_FORCE_MMQ)
  add_compile_definitions(GGML_CUDA_FORCE_MMQ)
  endif()
+ if (LLAMA_CUDA_FORCE_CUBLAS)
+ add_compile_definitions(GGML_CUDA_FORCE_CUBLAS)
+ endif()
  if (LLAMA_CUDA_NO_VMM)
  add_compile_definitions(GGML_CUDA_NO_VMM)
  endif()

@@ -665,6 +667,7 @@ if (LLAMA_SYCL)
  #todo: AOT

  find_package(IntelSYCL REQUIRED)
+ find_package(MKL REQUIRED)

  message(STATUS "SYCL found")

@@ -679,11 +682,9 @@ if (LLAMA_SYCL)
  endif()

  add_compile_options(-I./) #include DPCT
- add_compile_options(-I/${SYCL_INCLUDE_DIR})

  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
  if (LLAMA_SYCL_TARGET STREQUAL "NVIDIA")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
  endif()

@@ -693,8 +694,10 @@ if (LLAMA_SYCL)
  list(APPEND GGML_SOURCES_SYCL "ggml-sycl.cpp")

  if (WIN32)
- set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl sycl7 OpenCL mkl_sycl_blas_dll.lib mkl_intel_ilp64_dll.lib mkl_sequential_dll.lib mkl_core_dll.lib)
+ set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
  else()
+ add_compile_options(-I/${SYCL_INCLUDE_DIR})
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
  if (LLAMA_SYCL_TARGET STREQUAL "INTEL")
  set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
  elseif (LLAMA_SYCL_TARGET STREQUAL "NVIDIA")

@@ -869,10 +872,6 @@ if (LLAMA_CPU_HBM)
  target_link_libraries(ggml PUBLIC memkind)
  endif()

- if (LLAMA_PERF)
- add_compile_definitions(GGML_PERF)
- endif()

  function(get_flags CCID CCVER)
  set(C_FLAGS "")
  set(CXX_FLAGS "")
@@ -11,9 +11,21 @@
  "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
  }
  },
+ {
+ "name": "sycl-base",
+ "hidden": true,
+ "generator": "Ninja",
+ "binaryDir": "${sourceDir}/build-${presetName}",
+ "cacheVariables": {
+ "CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
+ "CMAKE_CXX_COMPILER": "icx",
+ "LLAMA_SYCL": "ON",
+ "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
+ }
+ },
  { "name": "debug", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } },
- { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
+ { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } },
+ { "name": "reldbg", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
  { "name": "static", "hidden": true, "cacheVariables": { "LLAMA_STATIC": "ON" } },

  {

@@ -35,15 +47,18 @@
  },

  { "name": "arm64-windows-llvm-debug" , "inherits": [ "base", "arm64-windows-llvm", "debug" ] },
- { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "release" ] },
+ { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg" ] },
- { "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "release", "static" ] },
+ { "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg", "static" ] },

  { "name": "arm64-windows-msvc-debug" , "inherits": [ "base", "arm64-windows-msvc", "debug" ] },
- { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "release" ] },
+ { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg" ] },
- { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "release", "static" ] },
+ { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg", "static" ] },

  { "name": "x64-windows-msvc-debug" , "inherits": [ "base", "debug" ] },
- { "name": "x64-windows-msvc-release", "inherits": [ "base", "release" ] },
+ { "name": "x64-windows-msvc-release", "inherits": [ "base", "reldbg" ] },
- { "name": "x64-windows-msvc+static-release", "inherits": [ "base", "release", "static" ] }
+ { "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] },

+ { "name": "x64-windows-sycl-debug" , "inherits": [ "sycl-base", "debug" ] },
+ { "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] }
  ]
  }
Makefile (8 changes)

@@ -344,9 +344,6 @@ ifdef LLAMA_GPROF
  MK_CFLAGS += -pg
  MK_CXXFLAGS += -pg
  endif
- ifdef LLAMA_PERF
- MK_CPPFLAGS += -DGGML_PERF
- endif

  # Architecture specific
  # TODO: probably these flags need to be tweaked on some architectures

@@ -540,6 +537,9 @@ endif # LLAMA_CUDA_FORCE_DMMV
  ifdef LLAMA_CUDA_FORCE_MMQ
  MK_NVCCFLAGS += -DGGML_CUDA_FORCE_MMQ
  endif # LLAMA_CUDA_FORCE_MMQ
+ ifdef LLAMA_CUDA_FORCE_CUBLAS
+ MK_NVCCFLAGS += -DGGML_CUDA_FORCE_CUBLAS
+ endif # LLAMA_CUDA_FORCE_CUBLAS
  ifdef LLAMA_CUDA_DMMV_X
  MK_NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
  else

@@ -1051,7 +1051,7 @@ tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o grammar-
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

- tests/test-grammar-integration: tests/test-grammar-integration.cpp ggml.o llama.o grammar-parser.o $(OBJS)
+ tests/test-grammar-integration: tests/test-grammar-integration.cpp json-schema-to-grammar.o ggml.o llama.o grammar-parser.o $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -410,15 +410,9 @@ Output (example):

  4. Install build tools

- a. Download & install cmake for Windows: https://cmake.org/download/
+ a. Download & install cmake for Windows: https://cmake.org/download/ (CMake can also be installed from Visual Studio Installer)
+ b. The new Visual Studio will install Ninja as default. (If not, please install it manually: https://ninja-build.org/)

- b. Download & install mingw-w64 make for Windows provided by w64devkit

- - Download the 1.19.0 version of [w64devkit](https://github.com/skeeto/w64devkit/releases/download/v1.19.0/w64devkit-1.19.0.zip).

- - Extract `w64devkit` on your pc.

- - Add the **bin** folder path in the Windows system PATH environment (for e.g. `C:\xxx\w64devkit\bin\`).

  ### II. Build llama.cpp

@@ -428,10 +422,10 @@ On the oneAPI command line window, step into the llama.cpp main directory and run:
  @call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force

  # Option 1: Use FP32 (recommended for better performance in most cases)
- cmake -B build -G "MinGW Makefiles" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release
+ cmake -B build -G "Ninja" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release

  # Option 2: Or FP16
- cmake -B build -G "MinGW Makefiles" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
+ cmake -B build -G "Ninja" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON

  cmake --build build --config Release -j
  ```

@@ -441,9 +435,23 @@ Otherwise, run the `win-build-sycl.bat` wrapper which encapsulates the former instructions:
  .\examples\sycl\win-build-sycl.bat
  ```

+ Or, use CMake presets to build:
+ ```sh
+ cmake --preset x64-windows-sycl-release
+ cmake --build build-x64-windows-sycl-release -j --target llama-cli
+
+ cmake -DLLAMA_SYCL_F16=ON --preset x64-windows-sycl-release
+ cmake --build build-x64-windows-sycl-release -j --target llama-cli
+
+ cmake --preset x64-windows-sycl-debug
+ cmake --build build-x64-windows-sycl-debug -j --target llama-cli
+ ```

+ Or, you can use Visual Studio to open llama.cpp folder as a CMake project. Choose the sycl CMake presets (`x64-windows-sycl-release` or `x64-windows-sycl-debug`) before you compile the project.

  *Notes:*

- - By default, calling `make` will build all target binary files. In case of a minimal experimental setup, the user can build the inference executable only through `make llama-cli`.
+ - In case of a minimal experimental setup, the user can build the inference executable only through `cmake --build build --config Release -j --target llama-cli`.

  ### III. Run the inference
@@ -510,8 +510,9 @@ Building the program with BLAS support may lead to some performance improvements
  |--------------------------------|------------------------|---------|-----------------|
  | LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
  | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
  | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
- | LLAMA_CUDA_FORCE_MMQ | Boolean | false | Force the use of dequantization + matrix multiplication kernels instead of leveraging Math libraries. | |
+ | LLAMA_CUDA_FORCE_MMQ | Boolean | false | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, RDNA3). Speed for large batch sizes will be worse but VRAM consumption will be lower. |
+ | LLAMA_CUDA_FORCE_CUBLAS | Boolean | false | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models |
  | LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
  | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
  | LLAMA_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
codecov.yml (14 changes; file removed)

@@ -1,14 +0,0 @@
- comment: off
-
- coverage:
- status:
- project:
- default:
- target: auto
- threshold: 0
- base: auto
- patch:
- default:
- target: auto
- threshold: 0
- base: auto
File diff suppressed because it is too large
@@ -152,7 +152,6 @@ struct gpt_params {
  bool prompt_cache_all = false; // save user input and generations to prompt cache
  bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it

- bool embedding = false; // get only sentence embedding
  bool escape = true; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
  bool multiline_input = false; // reverse the usage of `\`
  bool simple_io = false; // improves compatibility with subprocesses and limited consoles

@@ -179,6 +178,12 @@ struct gpt_params {
  std::string mmproj = ""; // path to multimodal projector
  std::vector<std::string> image; // path to image file(s)

+ // embedding
+ bool embedding = false; // get only sentence embedding
+ int32_t embd_normalize = 2; // normalisation for embendings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
+ std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
+ std::string embd_sep = "\n"; // separator of embendings

  // server params
  int32_t port = 8080; // server listens on this network port
  int32_t timeout_read = 600; // http read timeout in seconds

@@ -377,7 +382,7 @@ void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size
  // Embedding utils
  //

- void llama_embd_normalize(const float * inp, float * out, int n);
+ void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);

  float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n);
@@ -214,7 +214,7 @@ src_func = f"""
  """

  convert_py_pth = pathlib.Path("convert-hf-to-gguf.py")
- convert_py = convert_py_pth.read_text()
+ convert_py = convert_py_pth.read_text(encoding="utf-8")
  convert_py = re.sub(
  r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
  lambda m: m.group(1) + src_func + m.group(3),

@@ -222,7 +222,7 @@ convert_py = re.sub(
  flags=re.DOTALL | re.MULTILINE,
  )

- convert_py_pth.write_text(convert_py)
+ convert_py_pth.write_text(convert_py, encoding="utf-8")

  logger.info("+++ convert-hf-to-gguf.py was updated")
@@ -65,7 +65,8 @@ class Model:
  # subclasses should define this!
  model_arch: gguf.MODEL_ARCH

- def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool, model_name: str | None):
+ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool,
+ model_name: str | None, split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False):
  if type(self) is Model:
  raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
  self.dir_model = dir_model

@@ -80,7 +81,7 @@ class Model:
  if not self.is_safetensors:
  self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
  self.hparams = Model.load_hparams(self.dir_model)
- self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
+ self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
  self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
  self.tensor_names = None
  if self.ftype == gguf.LlamaFileType.GUESSED:

@@ -96,7 +97,8 @@ class Model:
  ftype_lw: str = ftype_up.lower()
  # allow templating the file name with the output ftype, useful with the "auto" ftype
  self.fname_out = fname_out.parent / fname_out.name.format(ftype_lw, outtype=ftype_lw, ftype=ftype_lw, OUTTYPE=ftype_up, FTYPE=ftype_up)
- self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)
+ self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file,
+ split_max_tensors=split_max_tensors, split_max_size=split_max_size, dry_run=dry_run, small_first_shard=small_first_shard)

  @classmethod
  def __init_subclass__(cls):

@@ -332,6 +334,8 @@ class Model:
  self.gguf_writer.close()

  def write_vocab(self):
+ if len(self.gguf_writer.tensors) != 1:
+ raise ValueError('Splitting the vocabulary is not supported')
  self.gguf_writer.write_header_to_file(self.fname_out)
  self.gguf_writer.write_kv_data_to_file()
  self.gguf_writer.close()

@@ -967,7 +971,11 @@ class XverseModel(Model):
  from transformers import AutoTokenizer
  tokenizer = AutoTokenizer.from_pretrained(dir_model)
  vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
- assert max(tokenizer.vocab.values()) < vocab_size
+ # Since we are checking the maximum index, we need to ensure it's strictly less than vocab_size,
+ # because vocab_size is the count of items, and indexes start at 0.
+ max_vocab_index = max(tokenizer.get_vocab().values())
+ if max_vocab_index >= vocab_size:
+ raise ValueError("Vocabulary size exceeds expected maximum size.")

  reverse_vocab: dict[int, str] = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
  added_vocab = tokenizer.get_added_vocab()
@@ -1400,6 +1408,48 @@ class LlamaModel(Model):
  raise ValueError(f"Unprocessed experts: {experts}")


+ @Model.register("BitnetForCausalLM")
+ class BitnetModel(Model):
+ model_arch = gguf.MODEL_ARCH.BITNET
+
+ def set_vocab(self):
+ self._set_vocab_sentencepiece()
+
+ def set_gguf_parameters(self):
+ super().set_gguf_parameters()
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+ self.gguf_writer.add_rope_scaling_factor(1.0)
+
+ def weight_quant(self, weight):
+ dtype = weight.dtype
+ weight = weight.float()
+ s = 1 / weight.abs().mean().clamp(min=1e-5)
+ weight = (weight * s).round().clamp(-1, 1) / s
+ scale = weight.abs().max().unsqueeze(0)
+ weight = torch.where(weight.abs().less(1e-6), 0, weight).type(dtype)
+ weight = torch.sign(weight).type(dtype)
+ return weight.type(dtype), scale.type(torch.float32)
+
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ new_name = self.map_tensor_name(name)
+
+ if any(self.match_model_tensor_name(new_name, key, bid) for key in [
+ gguf.MODEL_TENSOR.ATTN_Q,
+ gguf.MODEL_TENSOR.ATTN_K,
+ gguf.MODEL_TENSOR.ATTN_V,
+ gguf.MODEL_TENSOR.ATTN_OUT,
+ gguf.MODEL_TENSOR.FFN_UP,
+ gguf.MODEL_TENSOR.FFN_DOWN,
+ gguf.MODEL_TENSOR.FFN_GATE,
+ ]):
+ # transform weight into 1/0/-1 (in fp32)
+ weight_torch, scale_torch = self.weight_quant(data_torch)
+ yield (new_name, weight_torch)
+ yield (new_name.removesuffix(".weight") + ".scale", scale_torch)
+ else:
+ yield (new_name, data_torch)


  @Model.register("GrokForCausalLM")
  class GrokModel(Model):
  model_arch = gguf.MODEL_ARCH.GROK
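An aside, not part of the commit: the `weight_quant` helper in the new `BitnetModel` class above ternarizes each tensor. The standalone sketch below simply replays the same arithmetic on a small example so the scale/round/sign steps are easier to follow; it assumes PyTorch is available and the names are illustrative.

```python
import torch

def weight_quant(weight: torch.Tensor):
    # Mirrors the BitnetModel.weight_quant arithmetic shown in the diff above (sketch only).
    dtype = weight.dtype
    weight = weight.float()
    s = 1 / weight.abs().mean().clamp(min=1e-5)       # per-tensor scale from the mean absolute value
    weight = (weight * s).round().clamp(-1, 1) / s    # snap every value onto the ternary grid
    scale = weight.abs().max().unsqueeze(0)           # stored separately as the ".scale" tensor
    weight = torch.where(weight.abs().less(1e-6), 0, weight).type(dtype)
    weight = torch.sign(weight).type(dtype)           # final weights are just -1 / 0 / +1
    return weight.type(dtype), scale.type(torch.float32)

w = torch.tensor([[0.40, -0.05, 0.00], [-0.90, 0.25, 0.10]])
tern, scale = weight_quant(w)
print(tern)   # tensor of -1 / 0 / +1 entries
print(scale)  # one-element tensor holding the recovered scale
```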
@@ -2725,6 +2775,124 @@ class DeepseekV2Model(Model):
  raise ValueError(f"Unprocessed experts: {experts}")


+ @Model.register("T5ForConditionalGeneration")
+ @Model.register("T5WithLMHeadModel")
+ class T5Model(Model):
+ model_arch = gguf.MODEL_ARCH.T5
+
+ def set_vocab(self):
+ # to avoid TypeError: Descriptors cannot be created directly
+ # exception when importing sentencepiece_model_pb2
+ os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
+ from sentencepiece import SentencePieceProcessor
+ from sentencepiece import sentencepiece_model_pb2 as model
+
+ tokenizer_path = self.dir_model / 'spiece.model'
+
+ if not tokenizer_path.is_file():
+ raise FileNotFoundError(f"File not found: {tokenizer_path}")
+
+ sentencepiece_model = model.ModelProto()
+ sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
+ add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
+ remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
+ precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
+ assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM
+
+ tokenizer = SentencePieceProcessor()
+ tokenizer.LoadFromFile(str(tokenizer_path))
+
+ vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+
+ tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
+ scores: list[float] = [-10000.0] * vocab_size
+ toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+
+ for token_id in range(tokenizer.vocab_size()):
+ piece = tokenizer.IdToPiece(token_id)
+ text = piece.encode("utf-8")
+ score = tokenizer.GetScore(token_id)
+
+ toktype = SentencePieceTokenTypes.NORMAL
+ if tokenizer.IsUnknown(token_id):
+ toktype = SentencePieceTokenTypes.UNKNOWN
+ elif tokenizer.IsControl(token_id):
+ toktype = SentencePieceTokenTypes.CONTROL
+ elif tokenizer.IsUnused(token_id):
+ toktype = SentencePieceTokenTypes.UNUSED
+ elif tokenizer.IsByte(token_id):
+ toktype = SentencePieceTokenTypes.BYTE
+
+ tokens[token_id] = text
+ scores[token_id] = score
+ toktypes[token_id] = toktype
+
+ added_tokens_file = self.dir_model / 'added_tokens.json'
+ if added_tokens_file.is_file():
+ with open(added_tokens_file, "r", encoding="utf-8") as f:
+ added_tokens_json = json.load(f)
+ for key in added_tokens_json:
+ token_id = added_tokens_json[key]
+ if (token_id >= vocab_size):
+ logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
+ continue
+
+ tokens[token_id] = key.encode("utf-8")
+ scores[token_id] = -1000.0
+ toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+
+ if vocab_size > len(tokens):
+ pad_count = vocab_size - len(tokens)
+ logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
+ for i in range(1, pad_count + 1):
+ tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
+ scores.append(-1000.0)
+ toktypes.append(SentencePieceTokenTypes.UNUSED)
+
+ self.gguf_writer.add_tokenizer_model("t5")
+ self.gguf_writer.add_tokenizer_pre("default")
+ self.gguf_writer.add_token_list(tokens)
+ self.gguf_writer.add_token_scores(scores)
+ self.gguf_writer.add_token_types(toktypes)
+ self.gguf_writer.add_add_space_prefix(add_prefix)
+ self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
+ if precompiled_charsmap:
+ self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
+
+ special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+ special_vocab.add_to_gguf(self.gguf_writer)
+
+ self.gguf_writer.add_add_bos_token(False)
+ self.gguf_writer.add_add_eos_token(True)
+
+ def set_gguf_parameters(self):
+ self.gguf_writer.add_name("T5")
+ self.gguf_writer.add_context_length(self.hparams["n_positions"])
+ self.gguf_writer.add_embedding_length(self.hparams["d_model"])
+ self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"])
+ self.gguf_writer.add_block_count(self.hparams["num_layers"])
+ self.gguf_writer.add_head_count(self.hparams["num_heads"])
+ self.gguf_writer.add_key_length(self.hparams["d_kv"])
+ self.gguf_writer.add_value_length(self.hparams["d_kv"])
+ self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
+ self.gguf_writer.add_relative_attn_buckets_count(self.hparams["relative_attention_num_buckets"])
+ self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
+ self.gguf_writer.add_decoder_start_token_id(self.hparams["decoder_start_token_id"])
+ self.gguf_writer.add_file_type(self.ftype)
+
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ del bid # unused
+
+ # Sometimes T5 and Flan-T5 based models contain "encoder.embed_tokens.weight" tensor or
+ # "decoder.embed_tokens.weight" tensors that are duplicates of "shared.weight" tensor
+ # To prevent errors caused by an unnecessary unmapped tensor, skip both of them and use only "shared.weight".
+ if name == "decoder.embed_tokens.weight" or name == "encoder.embed_tokens.weight":
+ logger.debug(f"Skipping tensor {name!r} in safetensors so that convert can end normally.")
+ return []
+
+ return [(self.map_tensor_name(name), data_torch)]


  ###### CONVERSION LOGIC ######
@@ -2810,10 +2978,44 @@ def parse_args() -> argparse.Namespace:
  "--verbose", action="store_true",
  help="increase output verbosity",
  )
+ parser.add_argument(
+ "--split-max-tensors", type=int, default=0,
+ help="max tensors in each split",
+ )
+ parser.add_argument(
+ "--split-max-size", type=str, default="0",
+ help="max size per split N(M|G)",
+ )
+ parser.add_argument(
+ "--dry-run", action="store_true",
+ help="only print out a split plan and exit, without writing any new files",
+ )
+ parser.add_argument(
+ "--no-tensor-first-split", action="store_true",
+ help="do not add tensors to the first split (disabled by default)"
+ )

  return parser.parse_args()


+ def split_str_to_n_bytes(split_str: str) -> int:
+ if split_str.endswith("K"):
+ n = int(split_str[:-1]) * 1000
+ elif split_str.endswith("M"):
+ n = int(split_str[:-1]) * 1000 * 1000
+ elif split_str.endswith("G"):
+ n = int(split_str[:-1]) * 1000 * 1000 * 1000
+ elif split_str.isnumeric():
+ n = int(split_str)
+ else:
+ raise ValueError(f"Invalid split size: {split_str}, must be a number, optionally followed by K, M, or G")
+
+ if n < 0:
+ raise ValueError(f"Invalid split size: {split_str}, must be positive")
+
+ return n


  def main() -> None:
  args = parse_args()
@@ -2846,6 +3048,10 @@ def main() -> None:
  "auto": gguf.LlamaFileType.GUESSED,
  }

+ if args.use_temp_file and (args.split_max_tensors > 0 or args.split_max_size != "0"):
+ logger.error("Error: Cannot use temp file when splitting")
+ sys.exit(1)

  if args.outfile is not None:
  fname_out = args.outfile
  else:

@@ -2863,7 +3069,10 @@ def main() -> None:
  logger.error(f"Model {hparams['architectures'][0]} is not supported")
  sys.exit(1)

- model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, args.no_lazy, args.model_name)
+ model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file,
+ args.no_lazy, args.model_name, split_max_tensors=args.split_max_tensors,
+ split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run,
+ small_first_shard=args.no_tensor_first_split)

  logger.info("Set model parameters")
  model_instance.set_gguf_parameters()

@@ -2874,13 +3083,13 @@ def main() -> None:
  model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION)

  if args.vocab_only:
- logger.info(f"Exporting model vocab to '{model_instance.fname_out}'")
+ logger.info("Exporting model vocab...")
  model_instance.write_vocab()
+ logger.info("Model vocab successfully exported.")
  else:
- logger.info(f"Exporting model to '{model_instance.fname_out}'")
+ logger.info("Exporting model...")
  model_instance.write()
+ logger.info("Model successfully exported.")
- logger.info(f"Model successfully exported to '{model_instance.fname_out}'")


  if __name__ == '__main__':
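An aside, not part of the commit: the new `--split-max-size` option is parsed by the `split_str_to_n_bytes` helper added above, and its `K`/`M`/`G` suffixes are decimal multiples of 1000 rather than 1024. A minimal sketch of the same parsing rule:

```python
def split_str_to_n_bytes(split_str: str) -> int:
    # Same rule as the helper added in the diff above: decimal suffixes, not binary ones.
    suffixes = {"K": 1000, "M": 1000 ** 2, "G": 1000 ** 3}
    if split_str and split_str[-1] in suffixes:
        return int(split_str[:-1]) * suffixes[split_str[-1]]
    if split_str.isnumeric():
        return int(split_str)
    raise ValueError(f"Invalid split size: {split_str}")

assert split_str_to_n_bytes("500M") == 500_000_000
assert split_str_to_n_bytes("2G") == 2_000_000_000
```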
@@ -17,7 +17,7 @@ Related PRs:
  ./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99

  # With advanced options
- ./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --batch-pca 100
+ ./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --pca-batch 100

  # To see help message
  ./cvector-generator -h
@@ -40,7 +40,7 @@ static void print_usage(int argc, char ** argv, const gpt_params & params) {
  printf("\nexample usage:\n");
  printf("\n CPU only: %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf\n", argv[0]);
  printf("\n with GPU: %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99\n", argv[0]);
- printf("\n advanced: %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --batch-pca 100\n", argv[0]);
+ printf("\n advanced: %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --pca-batch 100\n", argv[0]);
  printf("\n");
  }

@@ -377,8 +377,8 @@ static int prepare_entries(gpt_params & params, train_context & ctx_train) {
  // create templated prompts
  std::vector<std::string> completions = ctrlvec_load_prompt_file(params.cvector_completions_file, false);
  auto format_template = [](std::string persona, std::string suffix) {
- // entry in positive/negative.txt must already be formatted i.e. "[INST] Act as if you're extremely happy. [/INST]"
+ // entry in positive/negative.txt must already be formatted i.e. "[INST] Act as if you're extremely happy. [/INST] "
- return persona + " " + suffix;
+ return persona + suffix;
  };
  for (size_t i = 0; i < positive_prompts.size(); ++i) {
  for (int j = 0; j < std::min((int) completions.size(), params.n_completions); ++j) {
@@ -1 +1 @@
- [INST] Act like a person who is extremely sad. [/INST]
+ [INST] Act like a person who is extremely sad. [/INST]

@@ -1 +1 @@
- [INST] Act like a person who is extremely happy. [/INST]
+ [INST] Act like a person who is extremely happy. [/INST]

(The old and new lines in these two prompt files differ only in trailing whitespace, which the extraction does not show; it matches the prompt-formatting change in cvector-generator.cpp above.)
@@ -19,3 +19,43 @@ llama-embedding.exe -m ./path/to/model --log-disable -p "Hello World!" 2>$null
  ```

  The above command will output space-separated float values.

+ ## extra parameters
+ ### --embd-normalize $integer$
+ | $integer$ | description | formula |
+ |-----------|---------------------|---------|
+ | $-1$ | none |
+ | $0$ | max absolute int16 | $\Large{{32760 * x_i} \over\max \lvert x_i\rvert}$
+ | $1$ | taxicab | $\Large{x_i \over\sum \lvert x_i\rvert}$
+ | $2$ | euclidean (default) | $\Large{x_i \over\sqrt{\sum x_i^2}}$
+ | $>2$ | p-norm | $\Large{x_i \over\sqrt[p]{\sum \lvert x_i\rvert^p}}$
+
+ ### --embd-output-format $'string'$
+ | $'string'$ | description | |
+ |------------|------------------------------|--|
+ | '' | same as before | (default)
+ | 'array' | single embeddings | $[[x_1,...,x_n]]$
+ | | multiple embeddings | $[[x_1,...,x_n],[x_1,...,x_n],...,[x_1,...,x_n]]$
+ | 'json' | openai style |
+ | 'json+' | add cosine similarity matrix |
+
+ ### --embd-separator $"string"$
+ | $"string"$ | |
+ |--------------|-|
+ | "\n" | (default)
+ | "<#embSep#>" | for exemple
+ | "<#sep#>" | other exemple
+
+ ## examples
+ ### Unix-based systems (Linux, macOS, etc.):
+
+ ```bash
+ ./embedding -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null
+ ```
+
+ ### Windows:
+
+ ```powershell
+ embedding.exe -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null
+ ```
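An aside, not part of the commit: the `--embd-normalize` modes documented in the table above can be restated as a small Python helper. This only illustrates the formulas on a plain list of floats; it is not code from the repository.

```python
def embd_normalize(x: list[float], norm: int = 2) -> list[float]:
    # -1: none, 0: scale max |x_i| to 32760 (int16 range), 1: taxicab, 2: euclidean, >2: p-norm
    if norm == -1:
        return list(x)
    if norm == 0:
        m = max(abs(v) for v in x) or 1.0
        return [32760.0 * v / m for v in x]
    s = sum(abs(v) ** norm for v in x) ** (1.0 / norm) or 1.0  # guard against all-zero input
    return [v / s for v in x]

vec = [3.0, -4.0]
print(embd_normalize(vec, 2))  # [0.6, -0.8]        (unit euclidean length)
print(embd_normalize(vec, 1))  # [0.4285..., -0.5714...]  (absolute values sum to 1)
```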
@@ -7,23 +7,30 @@
  #pragma warning(disable: 4244 4267) // possible loss of data
  #endif

- static std::vector<std::string> split_lines(const std::string & s) {
+ static std::vector<std::string> split_lines(const std::string & s, const std::string & separator = "\n") {
- std::string line;
  std::vector<std::string> lines;
- std::stringstream ss(s);
+ size_t start = 0;
- while (std::getline(ss, line)) {
+ size_t end = s.find(separator);
- lines.push_back(line);
+ while (end != std::string::npos) {
+ lines.push_back(s.substr(start, end - start));
+ start = end + separator.length();
+ end = s.find(separator, start);
  }

+ lines.push_back(s.substr(start)); // Add the last part

  return lines;
  }

- static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id) {
+ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) {
- for (size_t i = 0; i < tokens.size(); i++) {
+ size_t n_tokens = tokens.size();
- llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1);
+ for (size_t i = 0; i < n_tokens; i++) {
+ llama_batch_add(batch, tokens[i], i, { seq_id }, true);
  }
  }

- static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
+ static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) {
  // clear previous kv_cache values (irrelevant for embeddings)
  llama_kv_cache_clear(ctx);

@@ -40,22 +47,10 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * output

  // try to get sequence embeddings - supported only when pooling_type is not NONE
  const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
- if (embd == NULL) {
+ GGML_ASSERT(embd != NULL && "failed to get sequence embeddings");
- embd = llama_get_embeddings_ith(ctx, i);
- if (embd == NULL) {
- fprintf(stderr, "%s: failed to get embeddings for token %d\n", __func__, i);
- continue;
- }
- }

  float * out = output + batch.seq_id[i][0] * n_embd;
- //TODO: I would also add a parameter here to enable normalization or not.
+ llama_embd_normalize(embd, out, n_embd, embd_norm);
- /*fprintf(stdout, "unnormalized_embedding:");
- for (int hh = 0; hh < n_embd; hh++) {
- fprintf(stdout, "%9.6f ", embd[hh]);
- }
- fprintf(stdout, "\n");*/
- llama_embd_normalize(embd, out, n_embd);
  }
  }
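An aside, not part of the commit: the reworked `split_lines` above now cuts the prompt on an arbitrary separator string (`params.embd_sep`) instead of only on newlines, and always keeps the final segment. In Python terms the behaviour is roughly:

```python
def split_lines(s: str, separator: str = "\n") -> list[str]:
    # Rough Python equivalent of the new C++ split_lines: split on an arbitrary
    # separator string and keep the trailing segment even if it is empty.
    return s.split(separator)

print(split_lines("Castle<#sep#>Stronghold<#sep#>Dog", "<#sep#>"))
# ['Castle', 'Stronghold', 'Dog']
```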
@@ -97,6 +92,12 @@ int main(int argc, char ** argv) {
  const int n_ctx_train = llama_n_ctx_train(model);
  const int n_ctx = llama_n_ctx(ctx);

+ const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
+ if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
+ fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__);
+ return 1;
+ }

  if (n_ctx > n_ctx_train) {
  fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
  __func__, n_ctx_train, n_ctx);

@@ -109,7 +110,7 @@ int main(int argc, char ** argv) {
  }

  // split the prompt into lines
- std::vector<std::string> prompts = split_lines(params.prompt);
+ std::vector<std::string> prompts = split_lines(params.prompt, params.embd_sep);

  // max batch size
  const uint64_t n_batch = params.n_batch;
@@ -169,7 +170,7 @@ int main(int argc, char ** argv) {
  // encode if at capacity
  if (batch.n_tokens + n_toks > n_batch) {
  float * out = emb + p * n_embd;
- batch_decode(ctx, batch, out, s, n_embd);
+ batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
  llama_batch_clear(batch);
  p += s;
  s = 0;
@@ -182,29 +183,78 @@ int main(int argc, char ** argv) {

  // final batch
  float * out = emb + p * n_embd;
- batch_decode(ctx, batch, out, s, n_embd);
+ batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);

- // print the first part of the embeddings or for a single prompt, the full embedding
+ if (params.embd_out.empty()) {
- fprintf(stdout, "\n");
+ // print the first part of the embeddings or for a single prompt, the full embedding
- for (int j = 0; j < n_prompts; j++) {
- fprintf(stdout, "embedding %d: ", j);
- for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
- fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
- }
  fprintf(stdout, "\n");
- }
+ for (int j = 0; j < n_prompts; j++) {
+ fprintf(stdout, "embedding %d: ", j);
- // print cosine similarity matrix
+ for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
- if (n_prompts > 1) {
+ if (params.embd_normalize == 0) {
- fprintf(stdout, "\n");
+ fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
- printf("cosine similarity matrix:\n\n");
+ } else {
- for (int i = 0; i < n_prompts; i++) {
+ fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
- for (int j = 0; j < n_prompts; j++) {
+ }
- float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
- fprintf(stdout, "%6.2f ", sim);
  }
  fprintf(stdout, "\n");
  }

+ // print cosine similarity matrix
+ if (n_prompts > 1) {
+ fprintf(stdout, "\n");
+ printf("cosine similarity matrix:\n\n");
+ for (int i = 0; i < n_prompts; i++) {
+ fprintf(stdout, "%6.6s ", prompts[i].c_str());
+ }
+ fprintf(stdout, "\n");
+ for (int i = 0; i < n_prompts; i++) {
+ for (int j = 0; j < n_prompts; j++) {
+ float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
+ fprintf(stdout, "%6.2f ", sim);
+ }
+ fprintf(stdout, "%1.10s", prompts[i].c_str());
+ fprintf(stdout, "\n");
+ }
+ }
+ }

+ if (params.embd_out == "json" || params.embd_out == "json+" || params.embd_out == "array") {
+ const bool notArray = params.embd_out != "array";
+
+ fprintf(stdout, notArray ? "{\n \"object\": \"list\",\n \"data\": [\n" : "[");
+ for (int j = 0;;) { // at least one iteration (one prompt)
+ if (notArray) fprintf(stdout, " {\n \"object\": \"embedding\",\n \"index\": %d,\n \"embedding\": ",j);
+ fprintf(stdout, "[");
+ for (int i = 0;;) { // at least one iteration (n_embd > 0)
+ fprintf(stdout, params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]);
+ i++;
+ if (i < n_embd) fprintf(stdout, ","); else break;
+ }
+ fprintf(stdout, notArray ? "]\n }" : "]");
+ j++;
+ if (j < n_prompts) fprintf(stdout, notArray ? ",\n" : ","); else break;
+ }
+ fprintf(stdout, notArray ? "\n ]" : "]\n");
+
+ if (params.embd_out == "json+" && n_prompts > 1) {
+ fprintf(stdout, ",\n \"cosineSimilarity\": [\n");
+ for (int i = 0;;) { // at least two iteration (n_prompts > 1)
+ fprintf(stdout, " [");
+ for (int j = 0;;) { // at least two iteration (n_prompts > 1)
|
||||||
|
float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
|
||||||
|
fprintf(stdout, "%6.2f", sim);
|
||||||
|
j++;
|
||||||
|
if (j < n_prompts) fprintf(stdout, ", "); else break;
|
||||||
|
}
|
||||||
|
fprintf(stdout, " ]");
|
||||||
|
i++;
|
||||||
|
if (i < n_prompts) fprintf(stdout, ",\n"); else break;
|
||||||
|
}
|
||||||
|
fprintf(stdout, "\n ]");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (notArray) fprintf(stdout, "\n}\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
// clean up
|
// clean up
|
||||||
|
|
|
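Editor's note on the embedding changes above: the decode helper now threads the --embd-normalize value (`embd_norm`, where 0 prints the raw, unnormalized values) into llama_embd_normalize, and the similarity matrix is built with llama_embd_similarity_cos. The snippet below is a minimal, self-contained sketch of what those two helpers compute conceptually; `normalize_euclidean` and `similarity_cos` are illustrative names, not llama.cpp APIs, and the norm choice here (Euclidean) is an assumption for the example.

// Standalone sketch (not the llama.cpp implementation) of Euclidean normalization
// and cosine similarity between two embeddings.
#include <cmath>
#include <cstdio>
#include <vector>

// Scale a vector to unit Euclidean length; a zero vector stays zero.
static void normalize_euclidean(const float * inp, float * out, int n) {
    double sum = 0.0;
    for (int i = 0; i < n; i++) sum += (double) inp[i] * inp[i];
    const double norm = std::sqrt(sum);
    for (int i = 0; i < n; i++) out[i] = norm > 0.0 ? (float)(inp[i] / norm) : 0.0f;
}

// Cosine similarity; for already-normalized vectors this reduces to a dot product.
static float similarity_cos(const float * a, const float * b, int n) {
    double dot = 0.0, na = 0.0, nb = 0.0;
    for (int i = 0; i < n; i++) {
        dot += (double) a[i] * b[i];
        na  += (double) a[i] * a[i];
        nb  += (double) b[i] * b[i];
    }
    return (na > 0.0 && nb > 0.0) ? (float)(dot / (std::sqrt(na) * std::sqrt(nb))) : 0.0f;
}

int main() {
    std::vector<float> a = {1.0f, 2.0f, 2.0f}, b = {2.0f, 1.0f, 2.0f}, an(3), bn(3);
    normalize_euclidean(a.data(), an.data(), 3);
    normalize_euclidean(b.data(), bn.data(), 3);
    printf("cos(a, b) = %6.2f\n", similarity_cos(an.data(), bn.data(), 3));
    return 0;
}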
@@ -44,6 +44,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve

     // clear previous kv_cache values (irrelevant for embeddings)
     llama_kv_cache_clear(ctx);
+    llama_set_embeddings(ctx, true);
     llama_set_causal_attn(ctx, false);

     // run model

@@ -98,7 +99,9 @@ static std::string generate(llama_context * ctx, const std::string & prompt, boo
     llama_token eos_token = llama_token_eos(mdl);

     llama_kv_cache_clear(ctx);
+    llama_set_embeddings(ctx, false);
     llama_set_causal_attn(ctx, true);

     llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1);

     std::vector<llama_token> inputs = llama_tokenize(mdl, prompt, false, true);

@@ -166,8 +169,7 @@ int main(int argc, char * argv[]) {

     llama_model * mdl = llama_load_model_from_file(params.model.c_str(), mparams);

-    // create new context - set to embedding mode
-    cparams.embeddings = true;
+    // create generation context
     llama_context * ctx = llama_new_context_with_model(mdl, cparams);

     // ### Embedding/Representation ###
@@ -131,22 +131,29 @@ class LlamaState: ObservableObject {

        messageLog += "\(text)"

-        while await llamaContext.n_cur < llamaContext.n_len {
-            let result = await llamaContext.completion_loop()
-            messageLog += "\(result)"
+        Task.detached {
+            while await llamaContext.n_cur < llamaContext.n_len {
+                let result = await llamaContext.completion_loop()
+                await MainActor.run {
+                    self.messageLog += "\(result)"
+                }
+            }
+
+            let t_end = DispatchTime.now().uptimeNanoseconds
+            let t_generation = Double(t_end - t_heat_end) / self.NS_PER_S
+            let tokens_per_second = Double(await llamaContext.n_len) / t_generation
+
+            await llamaContext.clear()
+
+            await MainActor.run {
+                self.messageLog += """
+                    \n
+                    Done
+                    Heat up took \(t_heat)s
+                    Generated \(tokens_per_second) t/s\n
+                    """
+            }
        }

-        let t_end = DispatchTime.now().uptimeNanoseconds
-        let t_generation = Double(t_end - t_heat_end) / NS_PER_S
-        let tokens_per_second = Double(await llamaContext.n_len) / t_generation
-
-        await llamaContext.clear()
-        messageLog += """
-            \n
-            Done
-            Heat up took \(t_heat)s
-            Generated \(tokens_per_second) t/s\n
-            """
    }

    func bench() async {
@@ -16,41 +16,41 @@ struct quant_option {
 };

 static const std::vector<struct quant_option> QUANT_OPTIONS = {
-    { "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 3.56G, +0.2166 ppl @ LLaMA-v1-7B", },
-    { "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 3.90G, +0.1585 ppl @ LLaMA-v1-7B", },
-    { "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 4.33G, +0.0683 ppl @ LLaMA-v1-7B", },
-    { "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", },
+    { "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
+    { "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 4.78G, +0.4511 ppl @ Llama-3-8B", },
+    { "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 5.21G, +0.1316 ppl @ Llama-3-8B", },
+    { "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 5.65G, +0.1062 ppl @ Llama-3-8B", },
     { "IQ2_XXS",LLAMA_FTYPE_MOSTLY_IQ2_XXS," 2.06 bpw quantization", },
     { "IQ2_XS", LLAMA_FTYPE_MOSTLY_IQ2_XS, " 2.31 bpw quantization", },
     { "IQ2_S", LLAMA_FTYPE_MOSTLY_IQ2_S, " 2.5 bpw quantization", },
     { "IQ2_M", LLAMA_FTYPE_MOSTLY_IQ2_M, " 2.7 bpw quantization", },
     { "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", },
     { "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", },
-    { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
-    { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", },
+    { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.96G, +3.5199 ppl @ Llama-3-8B", },
+    { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.96G, +3.1836 ppl @ Llama-3-8B", },
     { "IQ3_XXS",LLAMA_FTYPE_MOSTLY_IQ3_XXS," 3.06 bpw quantization", },
     { "IQ3_S", LLAMA_FTYPE_MOSTLY_IQ3_S, " 3.44 bpw quantization", },
     { "IQ3_M", LLAMA_FTYPE_MOSTLY_IQ3_M, " 3.66 bpw quantization mix", },
     { "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
-    { "IQ3_XS", LLAMA_FTYPE_MOSTLY_IQ3_XS, " 3.3 bpw quantization" , },
-    { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },
-    { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", },
-    { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1764 ppl @ LLaMA-v1-7B", },
+    { "IQ3_XS", LLAMA_FTYPE_MOSTLY_IQ3_XS, " 3.3 bpw quantization", },
+    { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 3.41G, +1.6321 ppl @ Llama-3-8B", },
+    { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.74G, +0.6569 ppl @ Llama-3-8B", },
+    { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 4.03G, +0.5562 ppl @ Llama-3-8B", },
     { "IQ4_NL", LLAMA_FTYPE_MOSTLY_IQ4_NL, " 4.50 bpw non-linear quantization", },
     { "IQ4_XS", LLAMA_FTYPE_MOSTLY_IQ4_XS, " 4.25 bpw non-linear quantization", },
     { "Q4_K", LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", },
-    { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 3.59G, +0.0992 ppl @ LLaMA-v1-7B", },
-    { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 3.80G, +0.0532 ppl @ LLaMA-v1-7B", },
+    { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 4.37G, +0.2689 ppl @ Llama-3-8B", },
+    { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 4.58G, +0.1754 ppl @ Llama-3-8B", },
     { "Q5_K", LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M", },
-    { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0400 ppl @ LLaMA-v1-7B", },
-    { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", },
-    { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, +0.0008 ppl @ LLaMA-v1-7B", },
-    { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
-    { "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, -0.0020 ppl @ Mistral-7B", },
+    { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 5.21G, +0.1049 ppl @ Llama-3-8B", },
+    { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 5.33G, +0.0569 ppl @ Llama-3-8B", },
+    { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 6.14G, +0.0217 ppl @ Llama-3-8B", },
+    { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 7.96G, +0.0026 ppl @ Llama-3-8B", },
+    { "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, +0.0020 ppl @ Mistral-7B", },
     { "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", },
     { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
     // Note: Ensure COPY comes after F32 to avoid ftype 0 from matching.
     { "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", },
 };

 static const char * const LLM_KV_QUANTIZE_IMATRIX_FILE = "quantize.imatrix.file";
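For readers skimming the table above: the quantize tool resolves a user-supplied name such as "Q4_K_M" (or an alias such as "Q4_K") against QUANT_OPTIONS to pick the target ftype. The sketch below only illustrates that lookup idea under simplified assumptions; `quant_option_sketch` and `lookup_quant_option` are hypothetical names, and the real parser in quantize.cpp handles additional details.

// Hypothetical lookup of a quantization name against a QUANT_OPTIONS-style table.
#include <string>
#include <vector>

struct quant_option_sketch {
    std::string name;
    int         ftype; // stands in for the llama_ftype enum value
    std::string desc;
};

// Returns true and sets ftype when the requested name matches an entry.
static bool lookup_quant_option(const std::vector<quant_option_sketch> & opts,
                                const std::string & requested, int & ftype) {
    for (const auto & opt : opts) {
        if (opt.name == requested) {
            ftype = opt.ftype;
            return true;
        }
    }
    return false;
}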
@@ -73,9 +73,10 @@ static std::vector<chunk> chunk_file(const std::string & filename, int chunk_siz
     return chunks;
 }

-static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id) {
-    for (size_t i = 0; i < tokens.size(); i++) {
-        llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1);
+static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) {
+    size_t n_tokens = tokens.size();
+    for (size_t i = 0; i < n_tokens; i++) {
+        llama_batch_add(batch, tokens[i], i, { seq_id }, true);
     }
 }

@@ -160,6 +161,12 @@ int main(int argc, char ** argv) {
     const int n_ctx_train = llama_n_ctx_train(model);
     const int n_ctx = llama_n_ctx(ctx);

+    const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
+    if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
+        fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__);
+        return 1;
+    }
+
     if (n_ctx > n_ctx_train) {
         fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
                 __func__, n_ctx_train, n_ctx);
@@ -634,12 +634,12 @@ return html`
             <div>
               <div class="grammar">
                 <label for="template"></label>
-                <textarea id="grammar" name="grammar" placeholder="Use GBNF or JSON-Scheme + Converter" value="${params.value.grammar}" rows=4 oninput=${updateParams}/>
+                <textarea id="grammar" name="grammar" placeholder="Use GBNF or JSON Schema + Converter" value="${params.value.grammar}" rows=4 oninput=${updateParams}/>
               </div>
               <div class="grammar-columns">
                 <div class="json-schema-controls">
                   <input type="text" name="prop-order" placeholder="Order: prop1,prop2,prop3" oninput=${updateGrammarJsonSchemaPropOrder} />
-                  <button type="button" class="button-grammar" onclick=${convertJSONSchemaGrammar}>Convert JSON-Scheme</button>
+                  <button type="button" class="button-grammar" onclick=${convertJSONSchemaGrammar}>Convert JSON Schema</button>
                 </div>
               </div>
             </div>
@@ -1596,7 +1596,7 @@ struct server_context {
             } else {
                 std::string prompt;
                 if (task.data.contains("prompt") && task.data.at("prompt").is_string()) {
-                    json_value(task.data, "prompt", std::string());
+                    prompt = json_value(task.data, "prompt", std::string());
                 }

                 slot = get_available_slot(prompt);
@@ -13,16 +13,16 @@ if %errorlevel% neq 0 goto ERROR

:: for FP16
:: faster for long-prompt inference
-:: cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
+:: cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON

:: for FP32
-cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
+cmake -G "Ninja" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
if %errorlevel% neq 0 goto ERROR

:: build example/main only
:: make main

:: build all binary
-make -j
+cmake --build . -j
if %errorlevel% neq 0 goto ERROR

cd ..
ggml-cuda.cu

@@ -152,16 +152,16 @@ static ggml_cuda_device_info ggml_cuda_init() {
     GGML_ASSERT(info.device_count <= GGML_CUDA_MAX_DEVICES);

     int64_t total_vram = 0;
-#if defined(GGML_CUDA_FORCE_MMQ)
+#ifdef GGML_CUDA_FORCE_MMQ
     GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
 #else
     GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
-#endif
-#if defined(CUDA_USE_TENSOR_CORES)
-    GGML_CUDA_LOG_INFO("%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
+#endif // GGML_CUDA_FORCE_MMQ
+#ifdef GGML_CUDA_FORCE_CUBLAS
+    GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: yes\n", __func__);
 #else
-    GGML_CUDA_LOG_INFO("%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
-#endif
+    GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: no\n", __func__);
+#endif // GGML_CUDA_FORCE_CUBLAS
     GGML_CUDA_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
     for (int id = 0; id < info.device_count; ++id) {
         int device_vmm = 0;

@@ -635,7 +635,7 @@ static int64_t get_row_rounding(const std::array<float, GGML_CUDA_MAX_DEVICES> &
         }

         const int cc = ggml_cuda_info().devices[id].cc;
-        row_rounding = std::max(row_rounding, (int64_t)get_mmq_y_host(cc, get_mmq_x_max_host(cc)));
+        row_rounding = std::max(row_rounding, (int64_t)get_mmq_y_host(cc));
     }
     return row_rounding;
 }

@@ -1873,9 +1873,17 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
 static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer);

-    int64_t min_compute_capability = INT_MAX;
+    bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16)
+        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
+        && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->ne[1] == 1;
+    bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
+        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
+        && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
+    bool use_mul_mat_q = ggml_is_quantized(src0->type)
+        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;

-    bool any_pascal_with_slow_fp16 = false;
+    bool any_gpus_with_slow_fp16 = false;

     if (split) {
         ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
         auto & tensor_split = buft_ctx->tensor_split;

@@ -1885,55 +1893,18 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
                 continue;
             }

-            if (min_compute_capability > ggml_cuda_info().devices[id].cc) {
-                min_compute_capability = ggml_cuda_info().devices[id].cc;
-            }
-            if (ggml_cuda_info().devices[id].cc == 610) {
-                any_pascal_with_slow_fp16 = true;
-            }
+            const int cc = ggml_cuda_info().devices[id].cc;
+            use_mul_mat_vec_q = use_mul_mat_vec_q && cc >= MIN_CC_DP4A;
+            use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
+            any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
         }
     } else {
-        min_compute_capability = ggml_cuda_info().devices[ctx.device].cc;
-        any_pascal_with_slow_fp16 = ggml_cuda_info().devices[ctx.device].cc == 610;
+        const int cc = ggml_cuda_info().devices[ctx.device].cc;
+        use_mul_mat_vec_q = use_mul_mat_vec_q && cc >= MIN_CC_DP4A;
+        use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
+        any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
     }

-    // check data types and tensor shapes for custom matrix multiplication kernels:
-    bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16)
-        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
-        && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->ne[1] == 1;
-
-    bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
-        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
-        && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
-
-    bool use_mul_mat_q = ggml_cuda_supports_mmq(src0->type)
-        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-
-    const bool fp16_performance_good = min_compute_capability >= CC_RDNA1;
-
-#ifdef CUDA_USE_TENSOR_CORES
-    use_mul_mat_q = use_mul_mat_q && min_compute_capability < CC_RDNA3;
-#endif // CUDA_USE_TENSOR_CORES
-
-#else
-
-    // fp16 performance is good on Volta or newer and on P100 (compute capability 6.0)
-    const bool fp16_performance_good = min_compute_capability >= CC_PASCAL && !any_pascal_with_slow_fp16;
-
-    // mmvq and mmq need the __dp4a instruction which on NVIDIA is only available for CC >= 6.1
-    use_mul_mat_vec_q = use_mul_mat_vec_q && min_compute_capability >= MIN_CC_DP4A;
-    use_mul_mat_q = use_mul_mat_q && min_compute_capability >= MIN_CC_DP4A;
-
-#ifdef CUDA_USE_TENSOR_CORES
-    // when tensor cores are available, use them for large batch size
-    // ref: https://github.com/ggerganov/llama.cpp/pull/3776
-    use_mul_mat_q = use_mul_mat_q && (!fp16_performance_good || src1->ne[1] <= MMQ_MAX_BATCH_SIZE);
-#endif // CUDA_USE_TENSOR_CORES
-
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-
     // if mmvq is available it's a better choice than dmmv:
 #ifndef GGML_CUDA_FORCE_DMMV
     use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;

@@ -1947,14 +1918,15 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
     //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
     //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);

-    if (!split && !fp16_performance_good && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
-        // KQ single-batch
+    if (!split && any_gpus_with_slow_fp16 && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
+        // FP32 precision KQ single-batch for batch size 1 without FlashAttention
         ggml_cuda_mul_mat_vec_p021(ctx, src0, src1, dst);
-    } else if (!split && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
-        // KQV single-batch
+    } else if (!split && any_gpus_with_slow_fp16 && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
+        // FP32 precision KQV single-batch for batch size 1 without FlashAttention
         ggml_cuda_mul_mat_vec_nc(ctx, src0, src1, dst);
-    } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || fp16_performance_good) && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
-        // KQ + KQV multi-batch
+    } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16)
+            && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
+        // KQ + KQV multi-batch without FlashAttention
         ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
     } else if (use_dequantize_mul_mat_vec) {
         ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, nullptr);
@@ -146,23 +146,6 @@
 #define CC_RDNA2 (CC_OFFSET_AMD + 1030)
 #define CC_RDNA3 (CC_OFFSET_AMD + 1100)

-// define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
-// on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
-// for large computational tasks. the drawback is that this requires some extra amount of VRAM:
-// - 7B quantum model: +100-200 MB
-// - 13B quantum model: +200-400 MB
-//
-//#define GGML_CUDA_FORCE_MMQ
-
-// TODO: improve this to be correct for more hardware
-// for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
-#if !defined(GGML_CUDA_FORCE_MMQ)
-#define CUDA_USE_TENSOR_CORES
-#endif
-
-#define MMVQ_MAX_BATCH_SIZE 8 // max batch size to use MMVQ kernels
-#define MMQ_MAX_BATCH_SIZE 64 // max batch size to use MMQ kernels when tensor cores are available
-
 #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses

 #if defined(_MSC_VER)

@@ -343,15 +326,15 @@ static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int
 #define INT8_MMA_AVAILABLE
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING

-static bool fast_fp16_available(const int cc) {
+static constexpr bool fast_fp16_available(const int cc) {
     return cc >= CC_PASCAL && cc != 610;
 }

-static bool fp16_mma_available(const int cc) {
+static constexpr bool fp16_mma_available(const int cc) {
     return cc < CC_OFFSET_AMD && cc >= CC_VOLTA;
 }

-static bool int8_mma_available(const int cc) {
+static constexpr bool int8_mma_available(const int cc) {
     return cc < CC_OFFSET_AMD && cc >= CC_TURING;
 }

@@ -643,19 +626,6 @@ struct ggml_cuda_type_traits<GGML_TYPE_IQ3_S> {
     static constexpr int qi = QI3_S;
 };

-static int get_mmq_x_max_host(const int cc) {
-#ifdef CUDA_USE_TENSOR_CORES
-    return cc >= CC_VOLTA && cc < CC_OFFSET_AMD ? MMQ_MAX_BATCH_SIZE : 64;
-#else
-    return cc >= CC_VOLTA && cc < CC_OFFSET_AMD ? 128 : 64;
-#endif // CUDA_USE_TENSOR_CORES
-}
-
-// Round rows to this value for --split-mode row:
-static int get_mmq_y_host(const int cc, const int mmq_x) {
-    return cc >= CC_VOLTA && mmq_x >= 32 ? 128 : 64;
-}
-
 //////////////////////

 struct ggml_cuda_device_info {
@@ -20,6 +20,20 @@ struct mma_int_A_I16K4 {
         GGML_CUDA_ASSUME(ret < K);
         return ret;
     }

+    __device__ __forceinline__ void load(const int * __restrict__ xs0, const int & stride) {
+#if defined(INT8_MMA_AVAILABLE)
+        const int * xs = xs0 + (threadIdx.x%I)*stride + (threadIdx.x/I)*(K/2);
+        asm("ldmatrix.sync.aligned.m8n8.x2.b16 {%0, %1}, [%2];"
+            : "+r"(x[0]), "+r"(x[1])
+            : "l"(xs));
+#else
+#pragma unroll
+        for (int l = 0; l < ne; ++l) {
+            x[l] = xs0[get_i(l)*stride + get_k(l)];
+        }
+#endif // defined(INT8_MMA_AVAILABLE)
+    }
 };

 struct mma_int_A_I16K8 {

@@ -42,6 +56,20 @@ struct mma_int_A_I16K8 {
         GGML_CUDA_ASSUME(ret < K);
         return ret;
     }

+    __device__ __forceinline__ void load(const int * __restrict__ xs0, const int & stride) {
+#if defined(INT8_MMA_AVAILABLE)
+        const int * xs = xs0 + (threadIdx.x%I)*stride + (threadIdx.x/I)*(K/2);
+        asm("ldmatrix.sync.aligned.m8n8.x4.b16 {%0, %1, %2, %3}, [%4];"
+            : "+r"(x[0]), "+r"(x[1]), "+r"(x[2]), "+r"(x[3])
+            : "l"(xs));
+#else
+#pragma unroll
+        for (int l = 0; l < ne; ++l) {
+            x[l] = xs0[get_i(l)*stride + get_k(l)];
+        }
+#endif // defined(INT8_MMA_AVAILABLE)
+    }
 };

 struct mma_int_B_J8K4 {

@@ -64,6 +92,20 @@ struct mma_int_B_J8K4 {
         GGML_CUDA_ASSUME(ret < K);
         return ret;
     }

+    __device__ __forceinline__ void load(const int * __restrict__ xs0, const int & stride) {
+#if defined(INT8_MMA_AVAILABLE) && false // Loading as 4 byte values is faster
+        const int * xs = xs0 + (threadIdx.x%J)*stride;
+        asm("ldmatrix.sync.aligned.m8n8.x1.b16 {%0}, [%1];"
+            : "+r"(x[0])
+            : "l"(xs));
+#else
+#pragma unroll
+        for (int l = 0; l < ne; ++l) {
+            x[l] = xs0[get_j(l)*stride + get_k(l)];
+        }
+#endif // defined(INT8_MMA_AVAILABLE)
+    }
 };

 struct mma_int_B_J8K8 {

@@ -86,6 +128,20 @@ struct mma_int_B_J8K8 {
         GGML_CUDA_ASSUME(ret < K);
         return ret;
     }

+    __device__ __forceinline__ void load(const int * __restrict__ xs0, const int & stride) {
+#if defined(INT8_MMA_AVAILABLE) && false // Loading as 4 byte values is faster
+        const int * xs = xs0 + (threadIdx.x%J)*stride + ((threadIdx.x/J)*(K/2)) % K;
+        asm("ldmatrix.sync.aligned.m8n8.x2.b16 {%0, %1}, [%2];"
+            : "+r"(x[0]), "+r"(x[1])
+            : "l"(xs));
+#else
+#pragma unroll
+        for (int l = 0; l < ne; ++l) {
+            x[l] = xs0[get_j(l)*stride + get_k(l)];
+        }
+#endif // defined(INT8_MMA_AVAILABLE)
+    }
 };

 struct mma_int_C_I16J8 {
@@ -30,34 +30,34 @@ void ggml_cuda_op_mul_mat_q(

     switch (src0->type) {
         case GGML_TYPE_Q4_0:
-            mul_mat_q_case<GGML_TYPE_Q4_0>(args, stream);
+            mul_mat_q_case<GGML_TYPE_Q4_0>(ctx, args, stream);
             break;
         case GGML_TYPE_Q4_1:
-            mul_mat_q_case<GGML_TYPE_Q4_1>(args, stream);
+            mul_mat_q_case<GGML_TYPE_Q4_1>(ctx, args, stream);
             break;
         case GGML_TYPE_Q5_0:
-            mul_mat_q_case<GGML_TYPE_Q5_0>(args, stream);
+            mul_mat_q_case<GGML_TYPE_Q5_0>(ctx, args, stream);
             break;
         case GGML_TYPE_Q5_1:
-            mul_mat_q_case<GGML_TYPE_Q5_1>(args, stream);
+            mul_mat_q_case<GGML_TYPE_Q5_1>(ctx, args, stream);
             break;
         case GGML_TYPE_Q8_0:
-            mul_mat_q_case<GGML_TYPE_Q8_0>(args, stream);
+            mul_mat_q_case<GGML_TYPE_Q8_0>(ctx, args, stream);
            break;
         case GGML_TYPE_Q2_K:
-            mul_mat_q_case<GGML_TYPE_Q2_K>(args, stream);
+            mul_mat_q_case<GGML_TYPE_Q2_K>(ctx, args, stream);
             break;
         case GGML_TYPE_Q3_K:
-            mul_mat_q_case<GGML_TYPE_Q3_K>(args, stream);
+            mul_mat_q_case<GGML_TYPE_Q3_K>(ctx, args, stream);
             break;
         case GGML_TYPE_Q4_K:
-            mul_mat_q_case<GGML_TYPE_Q4_K>(args, stream);
+            mul_mat_q_case<GGML_TYPE_Q4_K>(ctx, args, stream);
             break;
         case GGML_TYPE_Q5_K:
-            mul_mat_q_case<GGML_TYPE_Q5_K>(args, stream);
+            mul_mat_q_case<GGML_TYPE_Q5_K>(ctx, args, stream);
             break;
         case GGML_TYPE_Q6_K:
-            mul_mat_q_case<GGML_TYPE_Q6_K>(args, stream);
+            mul_mat_q_case<GGML_TYPE_Q6_K>(ctx, args, stream);
             break;
         default:
             GGML_ASSERT(false);

@@ -69,7 +69,13 @@ void ggml_cuda_op_mul_mat_q(
     GGML_UNUSED(src1_ddf_i);
 }

-bool ggml_cuda_supports_mmq(enum ggml_type type) {
+bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
+#ifdef GGML_CUDA_FORCE_CUBLAS
+    return false;
+#endif // GGML_CUDA_FORCE_CUBLAS
+
+    bool mmq_supported;
+
     switch (type) {
         case GGML_TYPE_Q4_0:
         case GGML_TYPE_Q4_1:

@@ -81,8 +87,32 @@ bool ggml_cuda_supports_mmq(enum ggml_type type) {
         case GGML_TYPE_Q4_K:
         case GGML_TYPE_Q5_K:
         case GGML_TYPE_Q6_K:
-            return true;
+            mmq_supported = true;
+            break;
         default:
-            return false;
+            mmq_supported = false;
+            break;
     }

+    if (!mmq_supported) {
+        return false;
+    }
+
+    if (int8_mma_available(cc)) {
+        return true;
+    }
+
+    if (cc < MIN_CC_DP4A) {
+        return false;
+    }
+
+#ifdef GGML_CUDA_FORCE_MMQ
+    return true;
+#endif //GGML_CUDA_FORCE_MMQ
+
+    if (cc < CC_OFFSET_AMD) {
+        return cc < CC_VOLTA || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
+    }
+
+    return cc < CC_RDNA3 || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
 }

ggml-cuda/mmq.cuh: file diff suppressed because it is too large.
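Editor's note on ggml_cuda_should_use_mmq above: the new predicate decides, per device, whether the quantized MMQ kernels or cuBLAS should handle a matrix multiplication. The sketch below only restates that decision order in a standalone form; `should_use_mmq_sketch` is not a ggml function, and the numeric thresholds are hypothetical stand-ins for MIN_CC_DP4A, CC_VOLTA, CC_OFFSET_AMD, CC_RDNA3 and MMQ_DP4A_MAX_BATCH_SIZE, whose real values live in the CUDA headers.

// Illustrative restatement of the MMQ-vs-cuBLAS decision order (assumed thresholds, not ggml's).
#include <cstdint>
#include <cstdio>

static bool should_use_mmq_sketch(bool type_supported, bool int8_mma, int cc, int64_t ne11,
                                  bool force_cublas, bool force_mmq) {
    const int     MIN_CC_DP4A_SK   = 610;      // hypothetical stand-in values
    const int     CC_VOLTA_SK      = 700;
    const int     CC_OFFSET_AMD_SK = 1000000;
    const int     CC_RDNA3_SK      = 1001100;
    const int64_t MMQ_DP4A_MAX_BATCH_SIZE_SK = 64;

    if (force_cublas)          return false; // GGML_CUDA_FORCE_CUBLAS wins outright
    if (!type_supported)       return false; // quantization type has no MMQ kernel
    if (int8_mma)              return true;  // int8 tensor cores available: prefer MMQ
    if (cc < MIN_CC_DP4A_SK)   return false; // __dp4a instruction unavailable
    if (force_mmq)             return true;  // GGML_CUDA_FORCE_MMQ overrides the batch-size heuristic
    if (cc < CC_OFFSET_AMD_SK) return cc < CC_VOLTA_SK || ne11 < MMQ_DP4A_MAX_BATCH_SIZE_SK;
    return cc < CC_RDNA3_SK || ne11 < MMQ_DP4A_MAX_BATCH_SIZE_SK;
}

int main() {
    // A Volta-class GPU (no int8 MMA in this sketch's numbering) with a large batch falls back to cuBLAS.
    printf("%d\n", should_use_mmq_sketch(true, false, 700, 512, false, false));
    return 0;
}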
@@ -1,5 +1,7 @@
 #include "common.cuh"

+#define MMVQ_MAX_BATCH_SIZE 8 // Max. batch size for which to use MMVQ kernels.
+
 void ggml_cuda_op_mul_mat_vec_q(
     ggml_backend_cuda_context & ctx,
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
@@ -735,6 +735,12 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_tensor * t, size_t * offs
 }

 static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const struct ggml_tensor * op) {
+    for (size_t i = 0, n = 3; i < n; ++i) {
+        if (op->src[i] != NULL && op->src[i]->type == GGML_TYPE_BF16) {
+            return false;
+        }
+    }
+
     switch (op->op) {
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(op)) {
699
ggml-quants.c
699
ggml-quants.c
|
@ -8814,7 +8814,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined (__AVX2__) || defined (__ARM_NEON) || defined (__POWER9_VECTOR__) || defined(__loongarch_asx)
|
#if defined (__AVX__) || defined (__AVX2__) || defined (__ARM_NEON) || defined (__POWER9_VECTOR__) || defined(__loongarch_asx)
|
||||||
static const int8_t keven_signs_q2xs[1024] = {
|
static const int8_t keven_signs_q2xs[1024] = {
|
||||||
1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1,
|
1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1,
|
||||||
1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, -1,
|
1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, -1,
|
||||||
|
@ -8947,6 +8947,61 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
|
||||||
|
|
||||||
*s = 0.125f * hsum_float_8(accumf);
|
*s = 0.125f * hsum_float_8(accumf);
|
||||||
|
|
||||||
|
#elif defined(__AVX__)
|
||||||
|
const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
|
||||||
|
|
||||||
|
uint32_t aux32[4];
|
||||||
|
const uint8_t * aux8 = (const uint8_t *)aux32;
|
||||||
|
|
||||||
|
__m256 accumf = _mm256_setzero_ps();
|
||||||
|
for (int i = 0; i < nb; ++i) {
|
||||||
|
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||||
|
const uint16_t * restrict q2 = x[i].qs;
|
||||||
|
const int8_t * restrict q8 = y[i].qs;
|
||||||
|
__m128i sumi1_0 = _mm_setzero_si128();
|
||||||
|
__m128i sumi1_1 = _mm_setzero_si128();
|
||||||
|
__m128i sumi2_0 = _mm_setzero_si128();
|
||||||
|
__m128i sumi2_1 = _mm_setzero_si128();
|
||||||
|
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
||||||
|
const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||||
|
const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||||
|
const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||||
|
const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||||
|
memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8;
|
||||||
|
const __m128i q2_1_0 = _mm_set_epi64x(iq2xxs_grid[aux8[1]], iq2xxs_grid[aux8[0]]);
|
||||||
|
const __m128i q2_1_1 = _mm_set_epi64x(iq2xxs_grid[aux8[3]], iq2xxs_grid[aux8[2]]);
|
||||||
|
const __m128i q2_2_0 = _mm_set_epi64x(iq2xxs_grid[aux8[9]], iq2xxs_grid[aux8[8]]);
|
||||||
|
const __m128i q2_2_1 = _mm_set_epi64x(iq2xxs_grid[aux8[11]], iq2xxs_grid[aux8[10]]);
|
||||||
|
const __m128i s2_1_0 = _mm_set_epi64x(signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]);
|
||||||
|
const __m128i s2_1_1 = _mm_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127]);
|
||||||
|
const __m128i s2_2_0 = _mm_set_epi64x(signs64[(aux32[3] >> 7) & 127], signs64[(aux32[3] >> 0) & 127]);
|
||||||
|
const __m128i s2_2_1 = _mm_set_epi64x(signs64[(aux32[3] >> 21) & 127], signs64[(aux32[3] >> 14) & 127]);
|
||||||
|
const __m128i q8s_1_0 = _mm_sign_epi8(q8_1_0, s2_1_0);
|
||||||
|
const __m128i q8s_1_1 = _mm_sign_epi8(q8_1_1, s2_1_1);
|
||||||
|
const __m128i q8s_2_0 = _mm_sign_epi8(q8_2_0, s2_2_0);
|
||||||
|
const __m128i q8s_2_1 = _mm_sign_epi8(q8_2_1, s2_2_1);
|
||||||
|
const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
|
||||||
|
const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
|
||||||
|
const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
|
||||||
|
const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
|
||||||
|
const uint16_t ls1 = aux32[1] >> 28;
|
||||||
|
const uint16_t ls2 = aux32[3] >> 28;
|
||||||
|
const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(2*ls1+1));
|
||||||
|
const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(2*ls1+1));
|
||||||
|
const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(2*ls2+1));
|
||||||
|
const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(2*ls2+1));
|
||||||
|
sumi1_0 = _mm_add_epi32(sumi1_0, p1_0);
|
||||||
|
sumi1_1 = _mm_add_epi32(sumi1_1, p1_1);
|
||||||
|
sumi2_0 = _mm_add_epi32(sumi2_0, p2_0);
|
||||||
|
sumi2_1 = _mm_add_epi32(sumi2_1, p2_1);
|
||||||
|
}
|
||||||
|
|
||||||
|
accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
*s = 0.125f * hsum_float_8(accumf);
|
||||||
|
|
||||||
#elif defined(__POWER9_VECTOR__)
|
#elif defined(__POWER9_VECTOR__)
|
||||||
const vector int v0 = vec_splats((int32_t)0);
|
const vector int v0 = vec_splats((int32_t)0);
|
||||||
vector float vsumf0 = vec_splats(0.0f);
|
vector float vsumf0 = vec_splats(0.0f);
|
||||||
|
@ -9290,6 +9345,165 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
|
||||||
}
|
}
|
||||||
|
|
||||||
*s = 0.125f * hsum_float_8(accumf);
|
*s = 0.125f * hsum_float_8(accumf);
|
||||||
|
|
||||||
|
#elif defined(__AVX__)
|
||||||
|
const __m128i mone = _mm_set1_epi8(1);
|
||||||
|
static const char block_sign_shuffle_mask_1[32] = {
|
||||||
|
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
|
||||||
|
0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
|
||||||
|
};
|
||||||
|
static const char block_sign_shuffle_mask_2[32] = {
|
||||||
|
0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a,
|
||||||
|
0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e,
|
||||||
|
};
|
||||||
|
static const uint8_t bit_selector_mask_bytes[32] = {
|
||||||
|
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
|
||||||
|
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
|
||||||
|
};
|
||||||
|
|
||||||
|
const __m128i bit_selector_mask_0 = _mm_loadu_si128((const __m128i*)bit_selector_mask_bytes);
|
||||||
|
const __m128i bit_selector_mask_1 = _mm_loadu_si128((const __m128i*)bit_selector_mask_bytes + 1);
|
||||||
|
const __m128i block_sign_shuffle_1_0 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_1);
|
||||||
|
const __m128i block_sign_shuffle_1_1 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_1 + 1);
|
||||||
|
const __m128i block_sign_shuffle_2_0 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_2);
|
||||||
|
const __m128i block_sign_shuffle_2_1 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_2 + 1);
|
||||||
|
|
||||||
|
static const uint8_t k_bit_helper[32] = {
|
||||||
|
0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
|
||||||
|
0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
|
||||||
|
};
|
||||||
|
const __m128i bit_helper_0 = _mm_loadu_si128((const __m128i*)k_bit_helper);
|
||||||
|
const __m128i bit_helper_1 = _mm_loadu_si128((const __m128i*)k_bit_helper + 1);
|
||||||
|
const __m128i m511 = _mm_set1_epi16(511);
|
||||||
|
const __m128i m4 = _mm_set1_epi8(0xf);
|
||||||
|
const __m128i m1 = _mm_set1_epi8(1);
|
||||||
|
|
||||||
|
uint64_t aux64;
|
||||||
|
|
||||||
|
// somewhat hacky, but gives a significant boost in performance
|
||||||
|
__m256i aux_gindex;
|
||||||
|
const uint16_t * gindex = (const uint16_t *)&aux_gindex;
|
||||||
|
|
||||||
|
__m256 accumf = _mm256_setzero_ps();
|
||||||
|
for (int i = 0; i < nb; ++i) {
|
||||||
|
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||||
|
const uint16_t * restrict q2 = x[i].qs;
|
||||||
|
const int8_t * restrict q8 = y[i].qs;
|
||||||
|
|
||||||
|
memcpy(&aux64, x[i].scales, 8);
|
||||||
|
__m128i stmp = _mm_set1_epi64x(aux64);
|
||||||
|
stmp = _mm_unpacklo_epi8(_mm_and_si128(stmp, m4), _mm_and_si128(_mm_srli_epi16(stmp, 4), m4));
|
||||||
|
const __m128i scales = _mm_add_epi8(_mm_slli_epi16(stmp, 1), m1);
|
||||||
|
|
||||||
|
__m128i sumi1_0 = _mm_setzero_si128();
|
||||||
|
__m128i sumi1_1 = _mm_setzero_si128();
|
||||||
|
__m128i sumi2_0 = _mm_setzero_si128();
|
||||||
|
__m128i sumi2_1 = _mm_setzero_si128();
|
||||||
|
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 4) {
|
||||||
|
|
||||||
|
const __m128i q2_data_0 = _mm_loadu_si128((const __m128i*)q2);
|
||||||
|
const __m128i q2_data_1 = _mm_loadu_si128((const __m128i*)q2 + 1); q2 += 16;
|
||||||
|
aux_gindex = MM256_SET_M128I(_mm_and_si128(q2_data_1, m511), _mm_and_si128(q2_data_0, m511));
|
||||||
|
|
||||||
|
const __m128i partial_sign_bits_0 = _mm_srli_epi16(q2_data_0, 9);
|
||||||
|
const __m128i partial_sign_bits_1 = _mm_srli_epi16(q2_data_1, 9);
|
||||||
|
const __m128i partial_sign_bits_upper_0 = _mm_srli_epi16(q2_data_0, 13);
|
||||||
|
const __m128i partial_sign_bits_upper_1 = _mm_srli_epi16(q2_data_1, 13);
|
||||||
|
const __m128i partial_sign_bits_for_counting_0 = _mm_xor_si128(partial_sign_bits_0, partial_sign_bits_upper_0);
|
||||||
|
const __m128i partial_sign_bits_for_counting_1 = _mm_xor_si128(partial_sign_bits_1, partial_sign_bits_upper_1);
|
||||||
|
|
||||||
|
const __m128i odd_bits_0 = _mm_shuffle_epi8(bit_helper_0, partial_sign_bits_for_counting_0);
|
||||||
|
const __m128i odd_bits_1 = _mm_shuffle_epi8(bit_helper_1, partial_sign_bits_for_counting_1);
|
||||||
|
const __m128i full_sign_bits_0 = _mm_or_si128(partial_sign_bits_0, odd_bits_0);
|
||||||
|
const __m128i full_sign_bits_1 = _mm_or_si128(partial_sign_bits_1, odd_bits_1);
|
||||||
|
|
||||||
|
const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||||
|
const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||||
|
const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||||
|
const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||||
|
const __m128i q8_3_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||||
|
const __m128i q8_3_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||||
|
const __m128i q8_4_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||||
|
const __m128i q8_4_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||||
|
|
||||||
|
const __m128i q2_1_0 = _mm_set_epi64x(iq2xs_grid[gindex[1]], iq2xs_grid[gindex[0]]);
|
||||||
|
const __m128i q2_1_1 = _mm_set_epi64x(iq2xs_grid[gindex[3]], iq2xs_grid[gindex[2]]);
|
||||||
|
const __m128i q2_2_0 = _mm_set_epi64x(iq2xs_grid[gindex[5]], iq2xs_grid[gindex[4]]);
|
||||||
|
const __m128i q2_2_1 = _mm_set_epi64x(iq2xs_grid[gindex[7]], iq2xs_grid[gindex[6]]);
|
||||||
|
const __m128i q2_3_0 = _mm_set_epi64x(iq2xs_grid[gindex[9]], iq2xs_grid[gindex[8]]);
|
||||||
|
const __m128i q2_3_1 = _mm_set_epi64x(iq2xs_grid[gindex[11]], iq2xs_grid[gindex[10]]);
|
||||||
|
const __m128i q2_4_0 = _mm_set_epi64x(iq2xs_grid[gindex[13]], iq2xs_grid[gindex[12]]);
|
||||||
|
const __m128i q2_4_1 = _mm_set_epi64x(iq2xs_grid[gindex[15]], iq2xs_grid[gindex[14]]);
|
||||||
|
|
||||||
|
// AVX2 full_signs_1 is full_sign_bits_0 here
|
||||||
|
// AVX2 full_signs_2 is full_sign_bits_1 here
|
||||||
|
__m128i signs_0, signs_1;
|
||||||
|
signs_0 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_1_0);
|
||||||
|
signs_1 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_1_1);
|
||||||
|
signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0);
|
||||||
|
signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1);
|
||||||
|
const __m128i q8s_1_0 = _mm_sign_epi8(q8_1_0, _mm_or_si128(signs_0, mone));
|
||||||
|
const __m128i q8s_1_1 = _mm_sign_epi8(q8_1_1, _mm_or_si128(signs_1, mone));
|
||||||
|
|
||||||
|
signs_0 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_2_0);
|
||||||
|
signs_1 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_2_1);
|
||||||
|
signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0);
|
||||||
|
signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1);
|
||||||
|
const __m128i q8s_2_0 = _mm_sign_epi8(q8_2_0, _mm_or_si128(signs_0, mone));
|
||||||
|
const __m128i q8s_2_1 = _mm_sign_epi8(q8_2_1, _mm_or_si128(signs_1, mone));
|
||||||
|
|
||||||
|
signs_0 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_1_0);
|
||||||
|
signs_1 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_1_1);
|
||||||
|
signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0);
|
||||||
|
signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1);
|
||||||
|
const __m128i q8s_3_0 = _mm_sign_epi8(q8_3_0, _mm_or_si128(signs_0, mone));
|
||||||
|
const __m128i q8s_3_1 = _mm_sign_epi8(q8_3_1, _mm_or_si128(signs_1, mone));
|
||||||
|
|
||||||
|
signs_0 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_2_0);
|
||||||
|
signs_1 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_2_1);
|
||||||
|
signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0);
|
||||||
|
signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1);
|
||||||
|
const __m128i q8s_4_0 = _mm_sign_epi8(q8_4_0, _mm_or_si128(signs_0, mone));
|
||||||
|
const __m128i q8s_4_1 = _mm_sign_epi8(q8_4_1, _mm_or_si128(signs_1, mone));
|
||||||
|
|
||||||
|
const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
|
||||||
|
const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
|
||||||
|
const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
|
||||||
|
const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
|
||||||
|
const __m128i dot3_0 = _mm_maddubs_epi16(q2_3_0, q8s_3_0);
|
||||||
|
const __m128i dot3_1 = _mm_maddubs_epi16(q2_3_1, q8s_3_1);
|
||||||
|
const __m128i dot4_0 = _mm_maddubs_epi16(q2_4_0, q8s_4_0);
|
||||||
|
const __m128i dot4_1 = _mm_maddubs_epi16(q2_4_1, q8s_4_1);
|
||||||
|
|
||||||
|
__m128i sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+0));
|
||||||
|
const __m128i sc1_0 = _mm_cvtepi8_epi16(sc_tmp);
|
||||||
|
const __m128i sc1_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8));
|
||||||
|
sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+1));
|
||||||
|
const __m128i sc2_0 = _mm_cvtepi8_epi16(sc_tmp);
|
||||||
|
const __m128i sc2_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8));
|
||||||
|
sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+2));
|
||||||
|
const __m128i sc3_0 = _mm_cvtepi8_epi16(sc_tmp);
|
||||||
|
const __m128i sc3_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8));
|
||||||
|
sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+3));
|
||||||
|
const __m128i sc4_0 = _mm_cvtepi8_epi16(sc_tmp);
|
||||||
|
const __m128i sc4_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8));
|
||||||
|
|
||||||
|
sumi1_0 = _mm_add_epi32(sumi1_0, _mm_madd_epi16(dot1_0, sc1_0));
|
||||||
|
sumi1_1 = _mm_add_epi32(sumi1_1, _mm_madd_epi16(dot1_1, sc1_1));
|
||||||
|
sumi2_0 = _mm_add_epi32(sumi2_0, _mm_madd_epi16(dot2_0, sc2_0));
|
||||||
|
sumi2_1 = _mm_add_epi32(sumi2_1, _mm_madd_epi16(dot2_1, sc2_1));
|
||||||
|
sumi1_0 = _mm_add_epi32(sumi1_0, _mm_madd_epi16(dot3_0, sc3_0));
|
||||||
|
sumi1_1 = _mm_add_epi32(sumi1_1, _mm_madd_epi16(dot3_1, sc3_1));
|
||||||
|
sumi2_0 = _mm_add_epi32(sumi2_0, _mm_madd_epi16(dot4_0, sc4_0));
|
||||||
|
sumi2_1 = _mm_add_epi32(sumi2_1, _mm_madd_epi16(dot4_1, sc4_1));
|
||||||
|
}
|
||||||
|
|
||||||
|
accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
*s = 0.125f * hsum_float_8(accumf);
|
||||||
|
|
||||||
#elif defined(__loongarch_asx)
|
#elif defined(__loongarch_asx)
|
||||||
|
|
||||||
const __m256i mone = __lasx_xvreplgr2vr_b(1);
|
const __m256i mone = __lasx_xvreplgr2vr_b(1);
|
||||||
|
@ -9693,6 +9907,98 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
    *s = 0.125f * hsum_float_8(accumf);

#elif defined(__AVX__)
    static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
                                        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
    };

    static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
                                        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
    };

    const __m128i m4 = _mm_set1_epi8(0xf);
    const __m128i m1 = _mm_set1_epi8(1);

    const __m128i mask1_0 = _mm_loadu_si128((const __m128i*)k_mask1);
    const __m128i mask1_1 = _mm_loadu_si128((const __m128i*)k_mask1 + 1);
    const __m128i mask2_0 = _mm_loadu_si128((const __m128i*)k_mask2);
    const __m128i mask2_1 = _mm_loadu_si128((const __m128i*)k_mask2 + 1);

    uint64_t aux64;

    __m256 accumf = _mm256_setzero_ps();
    for (int i = 0; i < nb; ++i) {
        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
        const uint8_t * restrict qs = x[i].qs;
        const uint8_t * restrict qh = x[i].qh;
        const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
        const int8_t * restrict q8 = y[i].qs;

        memcpy(&aux64, x[i].scales, 8);
        const __m128i scales8 = _mm_add_epi8(_mm_slli_epi16(_mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), m4), 1), m1);
        const __m128i scales16_0 = _mm_cvtepi8_epi16(scales8);
        const __m128i scales16_1 = _mm_cvtepi8_epi16(_mm_srli_si128(scales8, 8));

        __m128i sumi1_0 = _mm_setzero_si128();
        __m128i sumi1_1 = _mm_setzero_si128();
        __m128i sumi2_0 = _mm_setzero_si128();
        __m128i sumi2_1 = _mm_setzero_si128();
        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
            const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
            const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
            const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
            const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
            const __m128i q2_1_0 = _mm_set_epi64x(iq2s_grid[qs[1] | ((qh[ib32+0] << 6) & 0x300)],
                                                  iq2s_grid[qs[0] | ((qh[ib32+0] << 8) & 0x300)]);
            const __m128i q2_1_1 = _mm_set_epi64x(iq2s_grid[qs[3] | ((qh[ib32+0] << 2) & 0x300)],
                                                  iq2s_grid[qs[2] | ((qh[ib32+0] << 4) & 0x300)]);
            const __m128i q2_2_0 = _mm_set_epi64x(iq2s_grid[qs[5] | ((qh[ib32+1] << 6) & 0x300)],
                                                  iq2s_grid[qs[4] | ((qh[ib32+1] << 8) & 0x300)]);
            const __m128i q2_2_1 = _mm_set_epi64x(iq2s_grid[qs[7] | ((qh[ib32+1] << 2) & 0x300)],
                                                  iq2s_grid[qs[6] | ((qh[ib32+1] << 4) & 0x300)]);
            qs += 8;

            __m128i aux128_0 = _mm_set1_epi32(signs[0] | ((uint32_t) signs[1] << 16));
            __m128i aux128_1 = aux128_0;
            aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0);
            aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1);
            const __m128i s2_1_0 = _mm_cmpeq_epi8(aux128_0, mask2_0);
            const __m128i s2_1_1 = _mm_cmpeq_epi8(aux128_1, mask2_1);
            const __m128i q8s_1_0 = _mm_sub_epi8(_mm_xor_si128(s2_1_0, q8_1_0), s2_1_0);
            const __m128i q8s_1_1 = _mm_sub_epi8(_mm_xor_si128(s2_1_1, q8_1_1), s2_1_1);

            aux128_0 = _mm_set1_epi32(signs[2] | ((uint32_t) signs[3] << 16));
            aux128_1 = aux128_0;
            aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0);
            aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1);
            const __m128i s2_2_0 = _mm_cmpeq_epi8(aux128_0, mask2_0);
            const __m128i s2_2_1 = _mm_cmpeq_epi8(aux128_1, mask2_1);
            const __m128i q8s_2_0 = _mm_sub_epi8(_mm_xor_si128(s2_2_0, q8_2_0), s2_2_0);
            const __m128i q8s_2_1 = _mm_sub_epi8(_mm_xor_si128(s2_2_1, q8_2_1), s2_2_1);

            signs += 4;

            const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
            const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
            const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
            const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1);

            const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_shuffle_epi8(scales16_0, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+0), 0)));
            const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_shuffle_epi8(scales16_1, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+0), 1)));
            const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_shuffle_epi8(scales16_0, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+1), 0)));
            const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_shuffle_epi8(scales16_1, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+1), 1)));
            sumi1_0 = _mm_add_epi32(sumi1_0, p1_0);
            sumi1_1 = _mm_add_epi32(sumi1_1, p1_1);
            sumi2_0 = _mm_add_epi32(sumi2_0, p2_0);
            sumi2_1 = _mm_add_epi32(sumi2_1, p2_1);
        }

        accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);

    }

    *s = 0.125f * hsum_float_8(accumf);

#elif defined(__POWER9_VECTOR__)
    static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
                                        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
@ -10019,6 +10325,63 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
|
||||||
|
|
||||||
*s = 0.25f * hsum_float_8(accumf);
|
*s = 0.25f * hsum_float_8(accumf);
|
||||||
|
|
||||||
|
#elif defined(__AVX__)
|
||||||
|
const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
|
||||||
|
|
||||||
|
uint32_t aux32[2];
|
||||||
|
|
||||||
|
__m256 accumf = _mm256_setzero_ps();
|
||||||
|
for (int i = 0; i < nb; ++i) {
|
||||||
|
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||||
|
const uint8_t * restrict q3 = x[i].qs;
|
||||||
|
const uint8_t * restrict gas = x[i].qs + QK_K/4;
|
||||||
|
const int8_t * restrict q8 = y[i].qs;
|
||||||
|
__m128i sumi1_0 = _mm_setzero_si128();
|
||||||
|
__m128i sumi1_1 = _mm_setzero_si128();
|
||||||
|
__m128i sumi2_0 = _mm_setzero_si128();
|
||||||
|
__m128i sumi2_1 = _mm_setzero_si128();
|
||||||
|
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
||||||
|
const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||||
|
const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||||
|
const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||||
|
const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||||
|
const __m128i q2_1_0 = _mm_set_epi32(iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
|
||||||
|
const __m128i q2_1_1 = _mm_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]]);
|
||||||
|
q3 += 8;
|
||||||
|
const __m128i q2_2_0 = _mm_set_epi32(iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
|
||||||
|
const __m128i q2_2_1 = _mm_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]]);
|
||||||
|
q3 += 8;
|
||||||
|
memcpy(aux32, gas, 8); gas += 8;
|
||||||
|
const __m128i s2_1_0 = _mm_set_epi64x(signs64[(aux32[0] >> 7) & 127], signs64[(aux32[0] >> 0) & 127]);
|
||||||
|
const __m128i s2_1_1 = _mm_set_epi64x(signs64[(aux32[0] >> 21) & 127], signs64[(aux32[0] >> 14) & 127]);
|
||||||
|
const __m128i s2_2_0 = _mm_set_epi64x(signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]);
|
||||||
|
const __m128i s2_2_1 = _mm_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127]);
|
||||||
|
const __m128i q8s_1_0 = _mm_sign_epi8(q8_1_0, s2_1_0);
|
||||||
|
const __m128i q8s_1_1 = _mm_sign_epi8(q8_1_1, s2_1_1);
|
||||||
|
const __m128i q8s_2_0 = _mm_sign_epi8(q8_2_0, s2_2_0);
|
||||||
|
const __m128i q8s_2_1 = _mm_sign_epi8(q8_2_1, s2_2_1);
|
||||||
|
const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
|
||||||
|
const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
|
||||||
|
const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
|
||||||
|
const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
|
||||||
|
const uint16_t ls1 = aux32[0] >> 28;
|
||||||
|
const uint16_t ls2 = aux32[1] >> 28;
|
||||||
|
const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(2*ls1+1));
|
||||||
|
const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(2*ls1+1));
|
||||||
|
const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(2*ls2+1));
|
||||||
|
const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(2*ls2+1));
|
||||||
|
sumi1_0 = _mm_add_epi32(sumi1_0, p1_0);
|
||||||
|
sumi1_1 = _mm_add_epi32(sumi1_1, p1_1);
|
||||||
|
sumi2_0 = _mm_add_epi32(sumi2_0, p2_0);
|
||||||
|
sumi2_1 = _mm_add_epi32(sumi2_1, p2_1);
|
||||||
|
}
|
||||||
|
|
||||||
|
accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
*s = 0.25f * hsum_float_8(accumf);
|
||||||
|
|
||||||
#elif defined(__POWER9_VECTOR__)
|
#elif defined(__POWER9_VECTOR__)
|
||||||
const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
|
const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
|
||||||
|
|
||||||
|
@ -10370,6 +10733,112 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
|
||||||
|
|
||||||
*s = hsum_float_8(accumf);
|
*s = hsum_float_8(accumf);
|
||||||
|
|
||||||
|
#elif defined(__AVX__)
|
||||||
|
static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
|
||||||
|
0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
|
||||||
|
};
|
||||||
|
|
||||||
|
static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
|
||||||
|
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
|
||||||
|
};
|
||||||
|
|
||||||
|
const __m128i mask1_0 = _mm_loadu_si128((const __m128i*)k_mask1);
|
||||||
|
const __m128i mask1_1 = _mm_loadu_si128((const __m128i*)k_mask1 + 1);
|
||||||
|
const __m128i mask2_0 = _mm_loadu_si128((const __m128i*)k_mask2);
|
||||||
|
const __m128i mask2_1 = _mm_loadu_si128((const __m128i*)k_mask2 + 1);
|
||||||
|
|
||||||
|
const __m128i idx_mul_0 = _mm_set_epi32(32, 64, 128, 256);
|
||||||
|
const __m128i idx_mul_1 = _mm_set_epi32(2, 4, 8, 16);
|
||||||
|
const __m128i idx_mask = _mm_set1_epi32(256);
|
||||||
|
|
||||||
|
typedef union {
|
||||||
|
__m128i vec[4];
|
||||||
|
uint32_t index[16];
|
||||||
|
} index_t;
|
||||||
|
|
||||||
|
index_t idx;
|
||||||
|
|
||||||
|
__m256 accumf = _mm256_setzero_ps();
|
||||||
|
for (int i = 0; i < nb; ++i) {
|
||||||
|
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||||
|
const uint8_t * restrict qs = x[i].qs;
|
||||||
|
const uint8_t * restrict qh = x[i].qh;
|
||||||
|
const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
|
||||||
|
const int8_t * restrict q8 = y[i].qs;
|
||||||
|
__m128i sumi1_0 = _mm_setzero_si128();
|
||||||
|
__m128i sumi1_1 = _mm_setzero_si128();
|
||||||
|
__m128i sumi2_0 = _mm_setzero_si128();
|
||||||
|
__m128i sumi2_1 = _mm_setzero_si128();
|
||||||
|
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
||||||
|
const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||||
|
const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||||
|
const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||||
|
const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||||
|
const __m128i qs_tmp = _mm_loadu_si128((const __m128i *)qs);
|
||||||
|
const __m128i idx_l_0 = _mm_cvtepu8_epi16(qs_tmp);
|
||||||
|
const __m128i idx_l_1 = _mm_cvtepu8_epi16(_mm_srli_si128(qs_tmp, 8)); qs += 16;
|
||||||
|
idx.vec[0] = _mm_set1_epi32(qh[ib32+0]);
|
||||||
|
idx.vec[1] = idx.vec[0];
|
||||||
|
idx.vec[2] = _mm_set1_epi32(qh[ib32+1]);
|
||||||
|
idx.vec[3] = idx.vec[2];
|
||||||
|
|
||||||
|
idx.vec[0] = _mm_and_si128(_mm_mullo_epi32(idx.vec[0], idx_mul_0), idx_mask);
|
||||||
|
idx.vec[1] = _mm_and_si128(_mm_mullo_epi32(idx.vec[1], idx_mul_1), idx_mask);
|
||||||
|
idx.vec[2] = _mm_and_si128(_mm_mullo_epi32(idx.vec[2], idx_mul_0), idx_mask);
|
||||||
|
idx.vec[3] = _mm_and_si128(_mm_mullo_epi32(idx.vec[3], idx_mul_1), idx_mask);
|
||||||
|
|
||||||
|
idx.vec[0] = _mm_or_si128(idx.vec[0], _mm_cvtepi16_epi32(idx_l_0));
|
||||||
|
idx.vec[1] = _mm_or_si128(idx.vec[1], _mm_cvtepi16_epi32(_mm_srli_si128(idx_l_0, 8)));
|
||||||
|
idx.vec[2] = _mm_or_si128(idx.vec[2], _mm_cvtepi16_epi32(idx_l_1));
|
||||||
|
idx.vec[3] = _mm_or_si128(idx.vec[3], _mm_cvtepi16_epi32(_mm_srli_si128(idx_l_1, 8)));
|
||||||
|
|
||||||
|
const __m128i q2_1_0 = _mm_set_epi32(iq3s_grid[idx.index[3]], iq3s_grid[idx.index[2]], iq3s_grid[idx.index[1]], iq3s_grid[idx.index[0]]);
|
||||||
|
const __m128i q2_1_1 = _mm_set_epi32(iq3s_grid[idx.index[7]], iq3s_grid[idx.index[6]], iq3s_grid[idx.index[5]], iq3s_grid[idx.index[4]]);
|
||||||
|
const __m128i q2_2_0 = _mm_set_epi32(iq3s_grid[idx.index[11]], iq3s_grid[idx.index[10]], iq3s_grid[idx.index[9]], iq3s_grid[idx.index[8]]);
|
||||||
|
const __m128i q2_2_1 = _mm_set_epi32(iq3s_grid[idx.index[15]], iq3s_grid[idx.index[14]], iq3s_grid[idx.index[13]], iq3s_grid[idx.index[12]]);
|
||||||
|
|
||||||
|
__m128i aux128_0 = _mm_set1_epi32(signs[0] | (signs[1] << 16));
|
||||||
|
__m128i aux128_1 = aux128_0;
|
||||||
|
aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0);
|
||||||
|
aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1);
|
||||||
|
const __m128i s2_1_0 = _mm_cmpeq_epi8(aux128_0, mask2_0);
|
||||||
|
const __m128i s2_1_1 = _mm_cmpeq_epi8(aux128_1, mask2_1);
|
||||||
|
const __m128i q8s_1_0 = _mm_sub_epi8(_mm_xor_si128(s2_1_0, q8_1_0), s2_1_0);
|
||||||
|
const __m128i q8s_1_1 = _mm_sub_epi8(_mm_xor_si128(s2_1_1, q8_1_1), s2_1_1);
|
||||||
|
|
||||||
|
aux128_0 = _mm_set1_epi32(signs[2] | (signs[3] << 16));
|
||||||
|
aux128_1 = aux128_0;
|
||||||
|
aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0);
|
||||||
|
aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1);
|
||||||
|
const __m128i s2_2_0 = _mm_cmpeq_epi8(aux128_0, mask2_0);
|
||||||
|
const __m128i s2_2_1 = _mm_cmpeq_epi8(aux128_1, mask2_1);
|
||||||
|
const __m128i q8s_2_0 = _mm_sub_epi8(_mm_xor_si128(s2_2_0, q8_2_0), s2_2_0);
|
||||||
|
const __m128i q8s_2_1 = _mm_sub_epi8(_mm_xor_si128(s2_2_1, q8_2_1), s2_2_1);
|
||||||
|
|
||||||
|
signs += 4;
|
||||||
|
|
||||||
|
const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
|
||||||
|
const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
|
||||||
|
const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
|
||||||
|
const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
|
||||||
|
const uint16_t ls1 = x[i].scales[ib32/2] & 0xf;
|
||||||
|
const uint16_t ls2 = x[i].scales[ib32/2] >> 4;
|
||||||
|
const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(2*ls1+1));
|
||||||
|
const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(2*ls1+1));
|
||||||
|
const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(2*ls2+1));
|
||||||
|
const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(2*ls2+1));
|
||||||
|
sumi1_0 = _mm_add_epi32(sumi1_0, p1_0);
|
||||||
|
sumi1_1 = _mm_add_epi32(sumi1_1, p1_1);
|
||||||
|
sumi2_0 = _mm_add_epi32(sumi2_0, p2_0);
|
||||||
|
sumi2_1 = _mm_add_epi32(sumi2_1, p2_1);
|
||||||
|
}
|
||||||
|
|
||||||
|
accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
*s = hsum_float_8(accumf);
|
||||||
|
|
||||||
#elif defined(__POWER9_VECTOR__)
|
#elif defined(__POWER9_VECTOR__)
|
||||||
static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
|
static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
|
||||||
0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
|
0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
|
||||||
|
@ -10607,6 +11076,14 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
}

#if defined(__AVX__)
static inline __m128i mul_add_epi8_sse(const __m128i x, const __m128i y) {
    const __m128i ax = _mm_sign_epi8(x, x);
    const __m128i sy = _mm_sign_epi8(y, x);
    return _mm_maddubs_epi16(ax, sy);
}
#endif

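A minimal scalar sketch of what each 16-bit lane of mul_add_epi8_sse holds, added here for orientation only (the reference function name is hypothetical and not part of the patch):

// Illustrative only: mul_add_epi8_sse(x, y) relies on _mm_maddubs_epi16, which multiplies
// unsigned by signed bytes. Making x non-negative with _mm_sign_epi8(x, x) and moving x's
// sign onto y with _mm_sign_epi8(y, x) keeps every product equal to x*y, so lane k is:
static inline int16_t mul_add_epi8_ref(const int8_t * x, const int8_t * y, int k) {
    return (int16_t)(x[2*k+0]*y[2*k+0] + x[2*k+1]*y[2*k+1]);
}
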
#if defined(__AVX2__)
static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
    const __m256i ax = _mm256_sign_epi8(x, x);
@ -10724,6 +11201,54 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
|
||||||
|
|
||||||
*s = hsum_float_8(accum) + IQ1S_DELTA * accum1;
|
*s = hsum_float_8(accum) + IQ1S_DELTA * accum1;
|
||||||
|
|
||||||
|
#elif defined __AVX__
|
||||||
|
__m256 accum = _mm256_setzero_ps();
|
||||||
|
float accum1 = 0;
|
||||||
|
for (int i = 0; i < nb; ++i) {
|
||||||
|
|
||||||
|
const int8_t * q8 = y[i].qs;
|
||||||
|
const uint8_t * qs = x[i].qs;
|
||||||
|
const uint16_t * qh = x[i].qh;
|
||||||
|
|
||||||
|
__m128i sumi1_0 = _mm_setzero_si128();
|
||||||
|
__m128i sumi1_1 = _mm_setzero_si128();
|
||||||
|
int sumi1 = 0;
|
||||||
|
for (int ib = 0; ib < QK_K/32; ib += 2) {
|
||||||
|
const __m128i q1b_1_0 = _mm_set_epi64x(iq1s_grid[qs[1] | ((qh[ib+0] << 5) & 0x700)], iq1s_grid[qs[0] | ((qh[ib+0] << 8) & 0x700)]);
|
||||||
|
const __m128i q1b_1_1 = _mm_set_epi64x(iq1s_grid[qs[3] | ((qh[ib+0] >> 1) & 0x700)], iq1s_grid[qs[2] | ((qh[ib+0] << 2) & 0x700)]);
|
||||||
|
const __m128i q1b_2_0 = _mm_set_epi64x(iq1s_grid[qs[5] | ((qh[ib+1] << 5) & 0x700)], iq1s_grid[qs[4] | ((qh[ib+1] << 8) & 0x700)]);
|
||||||
|
const __m128i q1b_2_1 = _mm_set_epi64x(iq1s_grid[qs[7] | ((qh[ib+1] >> 1) & 0x700)], iq1s_grid[qs[6] | ((qh[ib+1] << 2) & 0x700)]);
|
||||||
|
qs += 8;
|
||||||
|
const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||||
|
const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||||
|
const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||||
|
const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||||
|
|
||||||
|
const __m128i dot1_0 = mul_add_epi8_sse(q1b_1_0, q8b_1_0);
|
||||||
|
const __m128i dot1_1 = mul_add_epi8_sse(q1b_1_1, q8b_1_1);
|
||||||
|
const __m128i dot2_0 = mul_add_epi8_sse(q1b_2_0, q8b_2_0);
|
||||||
|
const __m128i dot2_1 = mul_add_epi8_sse(q1b_2_1, q8b_2_1);
|
||||||
|
const int16_t ls1 = 2*((qh[ib+0] >> 12) & 7) + 1;
|
||||||
|
const int16_t ls2 = 2*((qh[ib+1] >> 12) & 7) + 1;
|
||||||
|
const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(ls1));
|
||||||
|
const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(ls1));
|
||||||
|
const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(ls2));
|
||||||
|
const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(ls2));
|
||||||
|
|
||||||
|
sumi1_0 = _mm_add_epi32(sumi1_0, _mm_add_epi32(p1_0, p2_0));
|
||||||
|
sumi1_1 = _mm_add_epi32(sumi1_1, _mm_add_epi32(p1_1, p2_1));
|
||||||
|
sumi1 += (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]) * (qh[ib+0] & 0x8000 ? -1 : 1) * ls1
|
||||||
|
+ (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? -1 : 1) * ls2;
|
||||||
|
}
|
||||||
|
|
||||||
|
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
||||||
|
accum = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(sumi1_1, sumi1_0))), accum);
|
||||||
|
accum1 += d * sumi1;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
*s = hsum_float_8(accum) + IQ1S_DELTA * accum1;
|
||||||
|
|
||||||
#elif defined(__POWER9_VECTOR__)
|
#elif defined(__POWER9_VECTOR__)
|
||||||
const vector unsigned char v0 = vec_splats((unsigned char)0x0);
|
const vector unsigned char v0 = vec_splats((unsigned char)0x0);
|
||||||
const vector unsigned short vsign = vec_splats((unsigned short)0x8000);
|
const vector unsigned short vsign = vec_splats((unsigned short)0x8000);
|
||||||
|
@ -11062,6 +11587,92 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void
|
||||||
|
|
||||||
*s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2);
|
*s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2);
|
||||||
|
|
||||||
|
#elif defined __AVX__
|
||||||
|
const __m128i mask = _mm_set1_epi16(0x7);
|
||||||
|
const __m128i mone = _mm_set1_epi16(1);
|
||||||
|
|
||||||
|
__m256 accum1 = _mm256_setzero_ps();
|
||||||
|
__m256 accum2 = _mm256_setzero_ps();
|
||||||
|
for (int i = 0; i < nb; ++i) {
|
||||||
|
|
||||||
|
const int8_t * q8 = y[i].qs;
|
||||||
|
const uint8_t * qs = x[i].qs;
|
||||||
|
const uint8_t * qh = x[i].qh;
|
||||||
|
const uint16_t * sc = (const uint16_t *)x[i].scales;
|
||||||
|
|
||||||
|
scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
|
||||||
|
|
||||||
|
__m128i sumi1_0 = _mm_setzero_si128();
|
||||||
|
__m128i sumi1_1 = _mm_setzero_si128();
|
||||||
|
__m128i sumi2_0 = _mm_setzero_si128();
|
||||||
|
__m128i sumi2_1 = _mm_setzero_si128();
|
||||||
|
for (int ib = 0; ib < QK_K/32; ib += 2) {
|
||||||
|
const __m128i q1b_1_0 = _mm_set_epi64x(
|
||||||
|
iq1s_grid[qs[1] | (((uint16_t)qh[0] << 4) & 0x700)], iq1s_grid[qs[0] | (((uint16_t)qh[0] << 8) & 0x700)]);
|
||||||
|
const __m128i q1b_1_1 = _mm_set_epi64x(
|
||||||
|
iq1s_grid[qs[3] | (((uint16_t)qh[1] << 4) & 0x700)], iq1s_grid[qs[2] | (((uint16_t)qh[1] << 8) & 0x700)]);
|
||||||
|
const __m128i q1b_2_0 = _mm_set_epi64x(
|
||||||
|
iq1s_grid[qs[5] | (((uint16_t)qh[2] << 4) & 0x700)], iq1s_grid[qs[4] | (((uint16_t)qh[2] << 8) & 0x700)]);
|
||||||
|
const __m128i q1b_2_1 = _mm_set_epi64x(
|
||||||
|
iq1s_grid[qs[7] | (((uint16_t)qh[3] << 4) & 0x700)], iq1s_grid[qs[6] | (((uint16_t)qh[3] << 8) & 0x700)]);
|
||||||
|
const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||||
|
const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||||
|
const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||||
|
const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||||
|
|
||||||
|
const __m128i dot1_0 = mul_add_epi8_sse(q1b_1_0, q8b_1_0);
|
||||||
|
const __m128i dot1_1 = mul_add_epi8_sse(q1b_1_1, q8b_1_1);
|
||||||
|
const __m128i dot2_0 = mul_add_epi8_sse(q1b_2_0, q8b_2_0);
|
||||||
|
const __m128i dot2_1 = mul_add_epi8_sse(q1b_2_1, q8b_2_1);
|
||||||
|
|
||||||
|
const __m128i delta1_0 = _mm_set_epi64x(qh[0] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
|
||||||
|
qh[0] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
|
||||||
|
const __m128i delta1_1 = _mm_set_epi64x(qh[1] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
|
||||||
|
qh[1] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
|
||||||
|
const __m128i delta2_0 = _mm_set_epi64x(qh[2] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
|
||||||
|
qh[2] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
|
||||||
|
const __m128i delta2_1 = _mm_set_epi64x(qh[3] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
|
||||||
|
qh[3] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
|
||||||
|
|
||||||
|
const __m128i dot3_0 = mul_add_epi8_sse(delta1_0, q8b_1_0);
|
||||||
|
const __m128i dot3_1 = mul_add_epi8_sse(delta1_1, q8b_1_1);
|
||||||
|
const __m128i dot4_0 = mul_add_epi8_sse(delta2_0, q8b_2_0);
|
||||||
|
const __m128i dot4_1 = mul_add_epi8_sse(delta2_1, q8b_2_1);
|
||||||
|
|
||||||
|
__m128i scale1_0 = _mm_set1_epi16(sc[ib/2] >> 0);
|
||||||
|
__m128i scale1_1 = _mm_set1_epi16(sc[ib/2] >> 3);
|
||||||
|
__m128i scale2_0 = _mm_set1_epi16(sc[ib/2] >> 6);
|
||||||
|
__m128i scale2_1 = _mm_set1_epi16(sc[ib/2] >> 9);
|
||||||
|
|
||||||
|
scale1_0 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale1_0, mask), 1), mone);
|
||||||
|
scale1_1 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale1_1, mask), 1), mone);
|
||||||
|
scale2_0 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale2_0, mask), 1), mone);
|
||||||
|
scale2_1 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale2_1, mask), 1), mone);
|
||||||
|
const __m128i p1_0 = _mm_madd_epi16(dot1_0, scale1_0);
|
||||||
|
const __m128i p1_1 = _mm_madd_epi16(dot1_1, scale1_1);
|
||||||
|
const __m128i p2_0 = _mm_madd_epi16(dot2_0, scale2_0);
|
||||||
|
const __m128i p2_1 = _mm_madd_epi16(dot2_1, scale2_1);
|
||||||
|
const __m128i p3_0 = _mm_madd_epi16(dot3_0, scale1_0);
|
||||||
|
const __m128i p3_1 = _mm_madd_epi16(dot3_1, scale1_1);
|
||||||
|
const __m128i p4_0 = _mm_madd_epi16(dot4_0, scale2_0);
|
||||||
|
const __m128i p4_1 = _mm_madd_epi16(dot4_1, scale2_1);
|
||||||
|
|
||||||
|
sumi1_0 = _mm_add_epi32(sumi1_0, _mm_add_epi32(p1_0, p2_0));
|
||||||
|
sumi1_1 = _mm_add_epi32(sumi1_1, _mm_add_epi32(p1_1, p2_1));
|
||||||
|
sumi2_0 = _mm_add_epi32(sumi2_0, _mm_add_epi32(p3_0, p4_0));
|
||||||
|
sumi2_1 = _mm_add_epi32(sumi2_1, _mm_add_epi32(p3_1, p4_1));
|
||||||
|
|
||||||
|
qs += 8; qh += 4;
|
||||||
|
}
|
||||||
|
|
||||||
|
const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(scale.f16));
|
||||||
|
|
||||||
|
accum1 = _mm256_add_ps(_mm256_mul_ps(d, _mm256_cvtepi32_ps(MM256_SET_M128I(sumi1_1, sumi1_0))), accum1);
|
||||||
|
accum2 = _mm256_add_ps(_mm256_mul_ps(d, _mm256_cvtepi32_ps(MM256_SET_M128I(sumi2_1, sumi2_0))), accum2);
|
||||||
|
}
|
||||||
|
|
||||||
|
*s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2);
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
int sum1[2], sum2[2], delta[4];
|
int sum1[2], sum2[2], delta[4];
|
||||||
|
@ -11192,6 +11803,44 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
    *s = hsum_float_8(_mm256_add_ps(accum1, accum2));

#elif defined __AVX__
    const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
    const __m128i m4b = _mm_set1_epi8(0x0f);
    const __m128i mone = _mm_set1_epi16(1);

    __m256 accum1 = _mm256_setzero_ps();
    __m256 accum2 = _mm256_setzero_ps();
    for (int ib = 0; ib < nb; ib += 2) {
        const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[0].qs);
        const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[1].qs);
        const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)y[0].qs);
        const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)y[0].qs + 1);
        const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)y[1].qs);
        const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)y[1].qs + 1);

        const __m128i q4b_1_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b));
        const __m128i q4b_1_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b));
        const __m128i q4b_2_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b));
        const __m128i q4b_2_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b));
        const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0);
        const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1);
        const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0);
        const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1);
        const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, mone);
        const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, mone);
        const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, mone);
        const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, mone);
        accum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[0].d)*GGML_FP16_TO_FP32(x[0].d)),
                _mm256_cvtepi32_ps(MM256_SET_M128I(p_1_1, p_1_0))), accum1);
        accum2 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[1].d)*GGML_FP16_TO_FP32(x[1].d)),
                _mm256_cvtepi32_ps(MM256_SET_M128I(p_2_1, p_2_0))), accum2);

        y += 2;
        x += 2;
    }

    *s = hsum_float_8(_mm256_add_ps(accum1, accum2));

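For orientation, a scalar sketch of the IQ4_NL nibble decode that the _mm_shuffle_epi8 lookups above perform; the helper name is hypothetical and the snippet is not part of the patch:

// Illustrative only: the 16 stored bytes of an IQ4_NL block carry two 4-bit indices each;
// low nibbles give values 0..15 and high nibbles values 16..31, both mapped through the
// non-linear kvalues_iq4nl table (the SSE path above does this 16 bytes at a time).
static void iq4nl_decode_ref(const uint8_t qs[16], const int8_t kvalues[16], int8_t out[32]) {
    for (int j = 0; j < 16; ++j) {
        out[j]      = kvalues[qs[j] & 0x0f];
        out[j + 16] = kvalues[qs[j] >> 4];
    }
}
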
#elif defined(__POWER9_VECTOR__)
    const vector signed char lowMask = vec_splats((signed char)0xF);
    const vector signed int v0 = vec_splats((int32_t)0);
@ -11382,6 +12031,54 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
|
||||||
|
|
||||||
*s = hsum_float_8(accum);
|
*s = hsum_float_8(accum);
|
||||||
|
|
||||||
|
#elif defined __AVX__
|
||||||
|
const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
|
||||||
|
const __m128i m4b = _mm_set1_epi8(0x0f);
|
||||||
|
|
||||||
|
__m256 accum = _mm256_setzero_ps();
|
||||||
|
for (int ibl = 0; ibl < nb; ++ibl) {
|
||||||
|
const uint8_t * qs = x[ibl].qs;
|
||||||
|
const int8_t * q8 = y[ibl].qs;
|
||||||
|
uint16_t sh = x[ibl].scales_h;
|
||||||
|
__m128i sumi1_0 = _mm_setzero_si128();
|
||||||
|
__m128i sumi1_1 = _mm_setzero_si128();
|
||||||
|
__m128i sumi2_0 = _mm_setzero_si128();
|
||||||
|
__m128i sumi2_1 = _mm_setzero_si128();
|
||||||
|
for (int ib = 0; ib < QK_K/32; ib += 2) {
|
||||||
|
const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)qs); qs += 16;
|
||||||
|
const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)qs); qs += 16;
|
||||||
|
const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||||
|
const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||||
|
const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||||
|
const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||||
|
const __m128i q4b_1_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b));
|
||||||
|
const __m128i q4b_1_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b));
|
||||||
|
const __m128i q4b_2_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b));
|
||||||
|
const __m128i q4b_2_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b));
|
||||||
|
const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0);
|
||||||
|
const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1);
|
||||||
|
const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0);
|
||||||
|
const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1);
|
||||||
|
const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32;
|
||||||
|
const int16_t ls2 = ((x[ibl].scales_l[ib/2] >> 4) | ((sh << 2) & 0x30)) - 32;
|
||||||
|
sh >>= 4;
|
||||||
|
const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, _mm_set1_epi16(ls1));
|
||||||
|
const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, _mm_set1_epi16(ls1));
|
||||||
|
const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, _mm_set1_epi16(ls2));
|
||||||
|
const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, _mm_set1_epi16(ls2));
|
||||||
|
sumi1_0 = _mm_add_epi32(p_1_0, sumi1_0);
|
||||||
|
sumi1_1 = _mm_add_epi32(p_1_1, sumi1_1);
|
||||||
|
sumi2_0 = _mm_add_epi32(p_2_0, sumi2_0);
|
||||||
|
sumi2_1 = _mm_add_epi32(p_2_1, sumi2_1);
|
||||||
|
}
|
||||||
|
__m128i sumi12_0 = _mm_add_epi32(sumi1_0, sumi2_0);
|
||||||
|
__m128i sumi12_1 = _mm_add_epi32(sumi1_1, sumi2_1);
|
||||||
|
accum = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[ibl].d)*y[ibl].d),
|
||||||
|
_mm256_cvtepi32_ps(MM256_SET_M128I(sumi12_1, sumi12_0))), accum);
|
||||||
|
}
|
||||||
|
|
||||||
|
*s = hsum_float_8(accum);
|
||||||
|
|
||||||
#elif defined(__POWER9_VECTOR__)
|
#elif defined(__POWER9_VECTOR__)
|
||||||
const vector signed char lowMask = vec_splats((signed char)0xF);
|
const vector signed char lowMask = vec_splats((signed char)0xF);
|
||||||
const vector int v0 = vec_splats((int32_t)0);
|
const vector int v0 = vec_splats((int32_t)0);
|
||||||
|
|
|
@ -4620,7 +4620,7 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
    } else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
        // KQV single-batch
        ggml_sycl_mul_mat_vec_nc(ctx, src0, src1, dst);
    } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16) && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
    } else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
        // KQ + KQV multi-batch
        ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst);
    } else if (use_dequantize_mul_mat_vec) {

@ -4911,7 +4911,7 @@ static void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1
    GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
    GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);

    GGML_TENSOR_BINARY_OP_LOCALS;
    GGML_TENSOR_BINARY_OP_LOCALS01;

    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
    queue_ptr main_stream = ctx.stream();

@ -588,266 +588,222 @@ namespace dpct
|
||||||
out = prop;
|
out = prop;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// dpct device extension
|
/// dpct device extension
|
||||||
class device_ext : public sycl::device
|
class device_ext : public sycl::device {
|
||||||
{
|
typedef std::mutex mutex_type;
|
||||||
typedef std::mutex mutex_type;
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
device_ext() : sycl::device(), _ctx(*this) {}
|
device_ext() : sycl::device() {}
|
||||||
~device_ext()
|
~device_ext() {
|
||||||
{
|
std::lock_guard<mutex_type> lock(m_mutex);
|
||||||
std::lock_guard<mutex_type> lock(m_mutex);
|
clear_queues();
|
||||||
clear_queues();
|
}
|
||||||
}
|
device_ext(const sycl::device &base) : sycl::device(base) {
|
||||||
device_ext(const sycl::device &base) : sycl::device(base), _ctx(*this)
|
std::lock_guard<mutex_type> lock(m_mutex);
|
||||||
{
|
init_queues();
|
||||||
std::lock_guard<mutex_type> lock(m_mutex);
|
}
|
||||||
init_queues();
|
|
||||||
}
|
|
||||||
|
|
||||||
int is_native_atomic_supported() { return 0; }
|
int is_native_atomic_supported() { return 0; }
|
||||||
int get_major_version() const
|
int get_major_version() const { return dpct::get_major_version(*this); }
|
||||||
{
|
|
||||||
return dpct::get_major_version(*this);
|
|
||||||
}
|
|
||||||
|
|
||||||
int get_minor_version() const
|
int get_minor_version() const { return dpct::get_minor_version(*this); }
|
||||||
{
|
|
||||||
return dpct::get_minor_version(*this);
|
|
||||||
}
|
|
||||||
|
|
||||||
int get_max_compute_units() const
|
int get_max_compute_units() const {
|
||||||
{
|
return get_device_info().get_max_compute_units();
|
||||||
return get_device_info().get_max_compute_units();
|
}
|
||||||
}
|
|
||||||
|
|
||||||
/// Return the maximum clock frequency of this device in KHz.
|
/// Return the maximum clock frequency of this device in KHz.
|
||||||
int get_max_clock_frequency() const
|
int get_max_clock_frequency() const {
|
||||||
{
|
return get_device_info().get_max_clock_frequency();
|
||||||
return get_device_info().get_max_clock_frequency();
|
}
|
||||||
}
|
|
||||||
|
|
||||||
int get_integrated() const { return get_device_info().get_integrated(); }
|
int get_integrated() const { return get_device_info().get_integrated(); }
|
||||||
|
|
||||||
int get_max_sub_group_size() const
|
int get_max_sub_group_size() const {
|
||||||
{
|
return get_device_info().get_max_sub_group_size();
|
||||||
return get_device_info().get_max_sub_group_size();
|
}
|
||||||
}
|
|
||||||
|
|
||||||
int get_max_register_size_per_work_group() const
|
int get_max_register_size_per_work_group() const {
|
||||||
{
|
return get_device_info().get_max_register_size_per_work_group();
|
||||||
return get_device_info().get_max_register_size_per_work_group();
|
}
|
||||||
}
|
|
||||||
|
|
||||||
int get_max_work_group_size() const
|
int get_max_work_group_size() const {
|
||||||
{
|
return get_device_info().get_max_work_group_size();
|
||||||
return get_device_info().get_max_work_group_size();
|
}
|
||||||
}
|
|
||||||
|
|
||||||
int get_mem_base_addr_align() const
|
int get_mem_base_addr_align() const {
|
||||||
{
|
return get_info<sycl::info::device::mem_base_addr_align>();
|
||||||
return get_info<sycl::info::device::mem_base_addr_align>();
|
}
|
||||||
}
|
|
||||||
|
|
||||||
size_t get_global_mem_size() const
|
size_t get_global_mem_size() const {
|
||||||
{
|
return get_device_info().get_global_mem_size();
|
||||||
return get_device_info().get_global_mem_size();
|
}
|
||||||
}
|
|
||||||
|
|
||||||
size_t get_max_mem_alloc_size() const
|
size_t get_max_mem_alloc_size() const {
|
||||||
{
|
return get_device_info().get_max_mem_alloc_size();
|
||||||
return get_device_info().get_max_mem_alloc_size();
|
}
|
||||||
}
|
|
||||||
|
|
||||||
/// Get the number of bytes of free and total memory on the SYCL device.
|
/// Get the number of bytes of free and total memory on the SYCL device.
|
||||||
/// \param [out] free_memory The number of bytes of free memory on the SYCL device.
|
/// \param [out] free_memory The number of bytes of free memory on the
|
||||||
/// \param [out] total_memory The number of bytes of total memory on the SYCL device.
|
/// SYCL device. \param [out] total_memory The number of bytes of total
|
||||||
void get_memory_info(size_t &free_memory, size_t &total_memory)
|
/// memory on the SYCL device.
|
||||||
{
|
void get_memory_info(size_t &free_memory, size_t &total_memory) {
|
||||||
total_memory = get_device_info().get_global_mem_size();
|
total_memory = get_device_info().get_global_mem_size();
|
||||||
const char *warning_info = "get_memory_info: [warning] ext_intel_free_memory is not "
|
const char *warning_info =
|
||||||
"supported (export/set ZES_ENABLE_SYSMAN=1 to support), "
|
"get_memory_info: [warning] ext_intel_free_memory is not "
|
||||||
"use total memory as free memory";
|
"supported (export/set ZES_ENABLE_SYSMAN=1 to support), "
|
||||||
|
"use total memory as free memory";
|
||||||
#if (defined(__SYCL_COMPILER_VERSION) && __SYCL_COMPILER_VERSION >= 20221105)
|
#if (defined(__SYCL_COMPILER_VERSION) && __SYCL_COMPILER_VERSION >= 20221105)
|
||||||
if (!has(sycl::aspect::ext_intel_free_memory))
|
if (!has(sycl::aspect::ext_intel_free_memory)) {
|
||||||
{
|
std::cerr << warning_info << std::endl;
|
||||||
std::cerr << warning_info << std::endl;
|
free_memory = total_memory;
|
||||||
free_memory = total_memory;
|
} else {
|
||||||
}
|
free_memory = get_info<sycl::ext::intel::info::device::free_memory>();
|
||||||
else
|
}
|
||||||
{
|
|
||||||
free_memory = get_info<sycl::ext::intel::info::device::free_memory>();
|
|
||||||
}
|
|
||||||
#else
|
#else
|
||||||
std::cerr << warning_info << std::endl;
|
std::cerr << warning_info << std::endl;
|
||||||
free_memory = total_memory;
|
free_memory = total_memory;
|
||||||
#if defined(_MSC_VER) && !defined(__clang__)
|
#if defined(_MSC_VER) && !defined(__clang__)
|
||||||
#pragma message("Querying the number of bytes of free memory is not supported")
|
#pragma message("Querying the number of bytes of free memory is not supported")
|
||||||
#else
|
#else
|
||||||
#warning "Querying the number of bytes of free memory is not supported"
|
#warning "Querying the number of bytes of free memory is not supported"
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
void get_device_info(device_info &out) const {
|
||||||
|
dpct::get_device_info(out, *this);
|
||||||
|
}
|
||||||
|
|
||||||
|
device_info get_device_info() const {
|
||||||
|
device_info prop;
|
||||||
|
dpct::get_device_info(prop, *this);
|
||||||
|
return prop;
|
||||||
|
}
|
||||||
|
|
||||||
|
void reset() {
|
||||||
|
std::lock_guard<mutex_type> lock(m_mutex);
|
||||||
|
clear_queues();
|
||||||
|
init_queues();
|
||||||
|
}
|
||||||
|
|
||||||
|
sycl::queue &in_order_queue() { return _q_in_order; }
|
||||||
|
|
||||||
|
sycl::queue &out_of_order_queue() { return _q_out_of_order; }
|
||||||
|
|
||||||
|
sycl::queue &default_queue() { return in_order_queue(); }
|
||||||
|
|
||||||
|
void queues_wait_and_throw() {
|
||||||
|
std::unique_lock<mutex_type> lock(m_mutex);
|
||||||
|
lock.unlock();
|
||||||
|
for (auto &q : _queues) {
|
||||||
|
q.wait_and_throw();
|
||||||
}
|
}
|
||||||
|
// Guard the destruct of current_queues to make sure the ref count is
|
||||||
|
// safe.
|
||||||
|
lock.lock();
|
||||||
|
}
|
||||||
|
|
||||||
void get_device_info(device_info &out) const
|
sycl::queue create_queue(bool enable_exception_handler = false) {
|
||||||
{
|
return create_in_order_queue(enable_exception_handler);
|
||||||
dpct::get_device_info(out, *this);
|
}
|
||||||
}
|
|
||||||
|
|
||||||
device_info get_device_info() const
|
sycl::queue create_queue(sycl::device device,
|
||||||
{
|
bool enable_exception_handler = false) {
|
||||||
device_info prop;
|
return create_in_order_queue(device, enable_exception_handler);
|
||||||
dpct::get_device_info(prop, *this);
|
}
|
||||||
return prop;
|
|
||||||
}
|
|
||||||
|
|
||||||
void reset()
|
sycl::queue create_in_order_queue(bool enable_exception_handler = false) {
|
||||||
{
|
std::lock_guard<mutex_type> lock(m_mutex);
|
||||||
std::lock_guard<mutex_type> lock(m_mutex);
|
return create_queue_impl(enable_exception_handler,
|
||||||
clear_queues();
|
sycl::property::queue::in_order());
|
||||||
init_queues();
|
}
|
||||||
}
|
|
||||||
|
|
||||||
sycl::queue &in_order_queue() { return *_q_in_order; }
|
sycl::queue create_in_order_queue(sycl::device device,
|
||||||
|
|
||||||
sycl::queue &out_of_order_queue() { return *_q_out_of_order; }
|
|
||||||
|
|
||||||
sycl::queue &default_queue()
|
|
||||||
{
|
|
||||||
return in_order_queue();
|
|
||||||
}
|
|
||||||
|
|
||||||
void queues_wait_and_throw()
|
|
||||||
{
|
|
||||||
std::unique_lock<mutex_type> lock(m_mutex);
|
|
||||||
std::vector<std::shared_ptr<sycl::queue>> current_queues(
|
|
||||||
_queues);
|
|
||||||
lock.unlock();
|
|
||||||
for (const auto &q : current_queues)
|
|
||||||
{
|
|
||||||
q->wait_and_throw();
|
|
||||||
}
|
|
||||||
// Guard the destruct of current_queues to make sure the ref count is safe.
|
|
||||||
lock.lock();
|
|
||||||
}
|
|
||||||
|
|
||||||
sycl::queue *create_queue(bool enable_exception_handler = false)
|
|
||||||
{
|
|
||||||
return create_in_order_queue(enable_exception_handler);
|
|
||||||
}
|
|
||||||
|
|
||||||
sycl::queue *create_queue(sycl::context context, sycl::device device,
|
|
||||||
bool enable_exception_handler = false) {
|
|
||||||
return create_in_order_queue(context, device, enable_exception_handler);
|
|
||||||
}
|
|
||||||
|
|
||||||
sycl::queue *create_in_order_queue(bool enable_exception_handler = false) {
|
|
||||||
std::lock_guard<mutex_type> lock(m_mutex);
|
|
||||||
return create_queue_impl(enable_exception_handler,
|
|
||||||
sycl::property::queue::in_order());
|
|
||||||
}
|
|
||||||
|
|
||||||
sycl::queue *create_in_order_queue(sycl::context context, sycl::device device,
|
|
||||||
bool enable_exception_handler = false) {
|
bool enable_exception_handler = false) {
|
||||||
std::lock_guard<mutex_type> lock(m_mutex);
|
std::lock_guard<mutex_type> lock(m_mutex);
|
||||||
return create_queue_impl(context, device, enable_exception_handler,
|
return create_queue_impl(device, enable_exception_handler,
|
||||||
sycl::property::queue::in_order());
|
sycl::property::queue::in_order());
|
||||||
}
|
}
|
||||||
|
|
||||||
sycl::queue *create_out_of_order_queue(bool enable_exception_handler = false) {
|
sycl::queue create_out_of_order_queue(
|
||||||
std::lock_guard<mutex_type> lock(m_mutex);
|
bool enable_exception_handler = false) {
|
||||||
return create_queue_impl(enable_exception_handler);
|
std::lock_guard<mutex_type> lock(m_mutex);
|
||||||
}
|
return create_queue_impl(enable_exception_handler);
|
||||||
|
}
|
||||||
|
|
||||||
void destroy_queue(sycl::queue *&queue)
|
void destroy_queue(sycl::queue queue) {
|
||||||
{
|
std::lock_guard<mutex_type> lock(m_mutex);
|
||||||
std::lock_guard<mutex_type> lock(m_mutex);
|
_queues.clear();
|
||||||
_queues.erase(std::remove_if(_queues.begin(), _queues.end(),
|
}
|
||||||
[=](const std::shared_ptr<sycl::queue> &q) -> bool
|
void set_saved_queue(sycl::queue q) {
|
||||||
{
|
std::lock_guard<mutex_type> lock(m_mutex);
|
||||||
return q.get() == queue;
|
_saved_queue = q;
|
||||||
}),
|
}
|
||||||
_queues.end());
|
sycl::queue get_saved_queue() const {
|
||||||
queue = nullptr;
|
std::lock_guard<mutex_type> lock(m_mutex);
|
||||||
}
|
return _saved_queue;
|
||||||
void set_saved_queue(sycl::queue *q)
|
}
|
||||||
{
|
|
||||||
std::lock_guard<mutex_type> lock(m_mutex);
|
|
||||||
_saved_queue = q;
|
|
||||||
}
|
|
||||||
sycl::queue *get_saved_queue() const
|
|
||||||
{
|
|
||||||
std::lock_guard<mutex_type> lock(m_mutex);
|
|
||||||
return _saved_queue;
|
|
||||||
}
|
|
||||||
sycl::context get_context() const { return _ctx; }
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
void clear_queues()
|
void clear_queues() { _queues.clear(); }
|
||||||
{
|
|
||||||
_queues.clear();
|
|
||||||
_q_in_order = _q_out_of_order = _saved_queue = nullptr;
|
|
||||||
}
|
|
||||||
|
|
||||||
void init_queues()
|
void init_queues() {
|
||||||
{
|
_q_in_order =
|
||||||
_q_in_order = create_queue_impl(true, sycl::property::queue::in_order());
|
create_queue_impl(true, sycl::property::queue::in_order());
|
||||||
_q_out_of_order = create_queue_impl(true);
|
_q_out_of_order = create_queue_impl(true);
|
||||||
_saved_queue = &default_queue();
|
_saved_queue = default_queue();
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Caller should acquire resource \p m_mutex before calling this function.
|
/// Caller should acquire resource \p m_mutex before calling this
|
||||||
template <class... Properties>
|
/// function.
|
||||||
sycl::queue *create_queue_impl(bool enable_exception_handler,
|
template <class... Properties>
|
||||||
Properties... properties)
|
sycl::queue create_queue_impl(bool enable_exception_handler,
|
||||||
{
|
Properties... properties) {
|
||||||
sycl::async_handler eh = {};
|
sycl::async_handler eh = {};
|
||||||
if (enable_exception_handler)
|
if (enable_exception_handler) {
|
||||||
{
|
eh = exception_handler;
|
||||||
eh = exception_handler;
|
}
|
||||||
}
|
auto q = sycl::queue(*this, eh,
|
||||||
_queues.push_back(std::make_shared<sycl::queue>(
|
sycl::property_list(
|
||||||
_ctx, *this, eh,
|
|
||||||
sycl::property_list(
|
|
||||||
#ifdef DPCT_PROFILING_ENABLED
|
#ifdef DPCT_PROFILING_ENABLED
|
||||||
sycl::property::queue::enable_profiling(),
|
sycl::property::queue::enable_profiling(),
|
||||||
#endif
|
#endif
|
||||||
properties...)));
|
properties...));
|
||||||
|
_queues.push_back(q);
|
||||||
|
|
||||||
return _queues.back().get();
|
return _queues.back();
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class... Properties>
|
template <class... Properties>
|
||||||
sycl::queue *create_queue_impl(sycl::context context, sycl::device device,
|
sycl::queue create_queue_impl(sycl::device device,
|
||||||
bool enable_exception_handler,
|
bool enable_exception_handler,
|
||||||
Properties... properties) {
|
Properties... properties) {
|
||||||
sycl::async_handler eh = {};
|
sycl::async_handler eh = {};
|
||||||
if (enable_exception_handler) {
|
if (enable_exception_handler) {
|
||||||
eh = exception_handler;
|
eh = exception_handler;
|
||||||
}
|
|
||||||
_queues.push_back(std::make_shared<sycl::queue>(
|
|
||||||
context, device, eh,
|
|
||||||
sycl::property_list(
|
|
||||||
#ifdef DPCT_PROFILING_ENABLED
|
|
||||||
sycl::property::queue::enable_profiling(),
|
|
||||||
#endif
|
|
||||||
properties...)));
|
|
||||||
|
|
||||||
return _queues.back().get();
|
|
||||||
}
|
}
|
||||||
|
_queues.push_back(
|
||||||
|
sycl::queue(device, eh,
|
||||||
|
sycl::property_list(
|
||||||
|
#ifdef DPCT_PROFILING_ENABLED
|
||||||
|
sycl::property::queue::enable_profiling(),
|
||||||
|
#endif
|
||||||
|
properties...)));
|
||||||
|
|
||||||
void get_version(int &major, int &minor) const
|
return _queues.back();
|
||||||
{
|
}
|
||||||
detail::get_version(*this, major, minor);
|
|
||||||
}
|
void get_version(int &major, int &minor) const {
|
||||||
sycl::queue *_q_in_order, *_q_out_of_order;
|
detail::get_version(*this, major, minor);
|
||||||
sycl::queue *_saved_queue;
|
}
|
||||||
sycl::context _ctx;
|
sycl::queue _q_in_order, _q_out_of_order;
|
||||||
std::vector<std::shared_ptr<sycl::queue>> _queues;
|
sycl::queue _saved_queue;
|
||||||
mutable mutex_type m_mutex;
|
std::vector<sycl::queue> _queues;
|
||||||
|
mutable mutex_type m_mutex;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
/// device manager
|
/// device manager
|
||||||
class dev_mgr
|
class dev_mgr
|
||||||
{
|
{
|
||||||
|
|
39661 ggml-vulkan-shaders.hpp
File diff suppressed because it is too large
2091 ggml-vulkan.cpp
File diff suppressed because it is too large
41 ggml.h
@ -312,6 +312,12 @@
    GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
    GGML_TENSOR_LOCALS(size_t, nb, dst, nb)

#define GGML_TENSOR_BINARY_OP_LOCALS01 \
    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
    GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
    GGML_TENSOR_LOCALS(size_t, nb1, src1, nb)

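A rough sketch of what the new macro expands to, for orientation only (inferred from the GGML_TENSOR_LOCALS helpers already defined in ggml.h; not part of the patch):

// GGML_TENSOR_BINARY_OP_LOCALS01 declares the src0/src1 locals but, unlike
// GGML_TENSOR_BINARY_OP_LOCALS, no dst locals; roughly:
//   const int64_t ne00 = src0->ne[0]; ... const int64_t ne03 = src0->ne[3];
//   const size_t  nb00 = src0->nb[0]; ... const size_t  nb03 = src0->nb[3];
//   const int64_t ne10 = src1->ne[0]; ... const int64_t ne13 = src1->ne[3];
//   const size_t  nb10 = src1->nb[0]; ... const size_t  nb13 = src1->nb[3];
// This is the variant ggml_sycl_cpy switches to earlier in this diff.
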
#ifdef __cplusplus
extern "C" {
#endif

@ -585,11 +591,7 @@ extern "C" {
        struct ggml_tensor * grad;
        struct ggml_tensor * src[GGML_MAX_SRC];

        // performance
        // source tensor and offset for views
        int perf_runs;
        int64_t perf_cycles;
        int64_t perf_time_us;

        struct ggml_tensor * view_src;
        size_t view_offs;

@ -599,7 +601,7 @@ extern "C" {

        void * extra; // extra things e.g. for ggml-cuda.cu

        char padding[8];
        // char padding[4];
    };

    static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);

@ -646,11 +648,6 @@ extern "C" {
        struct ggml_hash_set visited_hash_table;

        enum ggml_cgraph_eval_order order;

        // performance
        int perf_runs;
        int64_t perf_cycles;
        int64_t perf_time_us;
    };

    // scratch buffer

@ -667,28 +664,6 @@ extern "C" {
        bool no_alloc; // don't allocate memory for the tensor data
    };

    // compute types

    // NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled.
    // This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995.
    enum ggml_task_type {
        GGML_TASK_TYPE_INIT = 0,
        GGML_TASK_TYPE_COMPUTE,
        GGML_TASK_TYPE_FINALIZE,
    };

    struct ggml_compute_params {
        enum ggml_task_type type;

        // ith = thread index, nth = number of threads
        int ith, nth;

        // work buffer for all threads
        size_t wsize;
        void * wdata;
    };

    // numa strategies
    enum ggml_numa_strategy {
        GGML_NUMA_STRATEGY_DISABLED = 0,
@@ -49,6 +49,7 @@ class Keys:
        EXPERT_WEIGHTS_SCALE = "{arch}.expert_weights_scale"
        POOLING_TYPE = "{arch}.pooling_type"
        LOGIT_SCALE = "{arch}.logit_scale"
+        DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id"

    class Attention:
        HEAD_COUNT = "{arch}.attention.head_count"

@@ -62,6 +63,7 @@ class Keys:
        CAUSAL = "{arch}.attention.causal"
        Q_LORA_RANK = "{arch}.attention.q_lora_rank"
        KV_LORA_RANK = "{arch}.attention.kv_lora_rank"
+        REL_BUCKETS_COUNT = "{arch}.attention.relative_buckets_count"

    class Rope:
        DIMENSION_COUNT = "{arch}.rope.dimension_count"

@@ -73,6 +75,11 @@ class Keys:
        SCALING_FINETUNED = "{arch}.rope.scaling.finetuned"
        SCALING_YARN_LOG_MUL = "{arch}.rope.scaling.yarn_log_multiplier"

+    class Split:
+        LLM_KV_SPLIT_NO = "split.no"
+        LLM_KV_SPLIT_COUNT = "split.count"
+        LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count"
+
    class SSM:
        CONV_KERNEL = "{arch}.ssm.conv_kernel"
        INNER_SIZE = "{arch}.ssm.inner_size"

@@ -80,33 +87,35 @@ class Keys:
        TIME_STEP_RANK = "{arch}.ssm.time_step_rank"

    class Tokenizer:
        MODEL = "tokenizer.ggml.model"
        PRE = "tokenizer.ggml.pre"
        LIST = "tokenizer.ggml.tokens"
        TOKEN_TYPE = "tokenizer.ggml.token_type"
        TOKEN_TYPE_COUNT = "tokenizer.ggml.token_type_count"  # for BERT-style token types
        SCORES = "tokenizer.ggml.scores"
        MERGES = "tokenizer.ggml.merges"
        BOS_ID = "tokenizer.ggml.bos_token_id"
        EOS_ID = "tokenizer.ggml.eos_token_id"
        UNK_ID = "tokenizer.ggml.unknown_token_id"
        SEP_ID = "tokenizer.ggml.seperator_token_id"
        PAD_ID = "tokenizer.ggml.padding_token_id"
        CLS_ID = "tokenizer.ggml.cls_token_id"
        MASK_ID = "tokenizer.ggml.mask_token_id"
        ADD_BOS = "tokenizer.ggml.add_bos_token"
        ADD_EOS = "tokenizer.ggml.add_eos_token"
        ADD_PREFIX = "tokenizer.ggml.add_space_prefix"
+        REMOVE_EXTRA_WS = "tokenizer.ggml.remove_extra_whitespaces"
+        PRECOMPILED_CHARSMAP = "tokenizer.ggml.precompiled_charsmap"
        HF_JSON = "tokenizer.huggingface.json"
        RWKV = "tokenizer.rwkv.world"
        CHAT_TEMPLATE = "tokenizer.chat_template"
        CHAT_TEMPLATE_N = "tokenizer.chat_template.{name}"
        CHAT_TEMPLATES = "tokenizer.chat_templates"
        # FIM/Infill special tokens constants
        PREFIX_ID = "tokenizer.ggml.prefix_token_id"
        SUFFIX_ID = "tokenizer.ggml.suffix_token_id"
        MIDDLE_ID = "tokenizer.ggml.middle_token_id"
        EOT_ID = "tokenizer.ggml.eot_token_id"


#

@@ -115,91 +124,123 @@ class Keys:


class MODEL_ARCH(IntEnum):
    LLAMA = auto()
    FALCON = auto()
    BAICHUAN = auto()
    GROK = auto()
    GPT2 = auto()
    GPTJ = auto()
    GPTNEOX = auto()
    MPT = auto()
    STARCODER = auto()
    REFACT = auto()
    BERT = auto()
    NOMIC_BERT = auto()
    JINA_BERT_V2 = auto()
    BLOOM = auto()
    STABLELM = auto()
    QWEN = auto()
    QWEN2 = auto()
    QWEN2MOE = auto()
    PHI2 = auto()
    PHI3 = auto()
    PLAMO = auto()
    CODESHELL = auto()
    ORION = auto()
    INTERNLM2 = auto()
    MINICPM = auto()
    GEMMA = auto()
    STARCODER2 = auto()
    MAMBA = auto()
    XVERSE = auto()
    COMMAND_R = auto()
    DBRX = auto()
    OLMO = auto()
    ARCTIC = auto()
    DEEPSEEK2 = auto()
+    BITNET = auto()
+    T5 = auto()


class MODEL_TENSOR(IntEnum):
    TOKEN_EMBD = auto()
    TOKEN_EMBD_NORM = auto()
    TOKEN_TYPES = auto()
    POS_EMBD = auto()
    OUTPUT = auto()
    OUTPUT_NORM = auto()
    ROPE_FREQS = auto()
    ROPE_FACTORS_LONG = auto()
    ROPE_FACTORS_SHORT = auto()
    ATTN_Q = auto()
    ATTN_K = auto()
    ATTN_V = auto()
    ATTN_QKV = auto()
    ATTN_OUT = auto()
    ATTN_NORM = auto()
    ATTN_NORM_2 = auto()
    ATTN_OUT_NORM = auto()
    ATTN_ROT_EMBD = auto()
    FFN_GATE_INP = auto()
    FFN_GATE_INP_SHEXP = auto()
    FFN_NORM = auto()
    FFN_GATE = auto()
    FFN_DOWN = auto()
    FFN_UP = auto()
    FFN_ACT = auto()
    FFN_NORM_EXP = auto()
    FFN_GATE_EXP = auto()
    FFN_DOWN_EXP = auto()
    FFN_UP_EXP = auto()
    FFN_GATE_SHEXP = auto()
    FFN_DOWN_SHEXP = auto()
    FFN_UP_SHEXP = auto()
    ATTN_Q_NORM = auto()
    ATTN_K_NORM = auto()
    LAYER_OUT_NORM = auto()
    SSM_IN = auto()
    SSM_CONV1D = auto()
    SSM_X = auto()
    SSM_DT = auto()
    SSM_A = auto()
    SSM_D = auto()
    SSM_OUT = auto()
    ATTN_Q_A = auto()
    ATTN_Q_B = auto()
    ATTN_KV_A_MQA = auto()
    ATTN_KV_B = auto()
    ATTN_Q_A_NORM = auto()
    ATTN_KV_A_NORM = auto()
+    FFN_SUB_NORM = auto()
+    ATTN_SUB_NORM = auto()
+    DEC_ATTN_NORM = auto()
+    DEC_ATTN_Q = auto()
+    DEC_ATTN_K = auto()
+    DEC_ATTN_V = auto()
+    DEC_ATTN_OUT = auto()
+    DEC_ATTN_REL_B = auto()
+    DEC_CROSS_ATTN_NORM = auto()
+    DEC_CROSS_ATTN_Q = auto()
+    DEC_CROSS_ATTN_K = auto()
+    DEC_CROSS_ATTN_V = auto()
+    DEC_CROSS_ATTN_OUT = auto()
+    DEC_CROSS_ATTN_REL_B = auto()
+    DEC_FFN_NORM = auto()
+    DEC_FFN_GATE = auto()
+    DEC_FFN_DOWN = auto()
+    DEC_FFN_UP = auto()
+    DEC_OUTPUT_NORM = auto()
+    ENC_ATTN_NORM = auto()
+    ENC_ATTN_Q = auto()
+    ENC_ATTN_K = auto()
+    ENC_ATTN_V = auto()
+    ENC_ATTN_OUT = auto()
+    ENC_ATTN_REL_B = auto()
+    ENC_FFN_NORM = auto()
+    ENC_FFN_GATE = auto()
+    ENC_FFN_DOWN = auto()
+    ENC_FFN_UP = auto()
+    ENC_OUTPUT_NORM = auto()


MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {

@@ -237,57 +278,89 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
    MODEL_ARCH.OLMO: "olmo",
    MODEL_ARCH.ARCTIC: "arctic",
    MODEL_ARCH.DEEPSEEK2: "deepseek2",
+    MODEL_ARCH.BITNET: "bitnet",
+    MODEL_ARCH.T5: "t5",
}

TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
    MODEL_TENSOR.TOKEN_EMBD: "token_embd",
    MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm",
    MODEL_TENSOR.TOKEN_TYPES: "token_types",
    MODEL_TENSOR.POS_EMBD: "position_embd",
    MODEL_TENSOR.OUTPUT_NORM: "output_norm",
    MODEL_TENSOR.OUTPUT: "output",
    MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
    MODEL_TENSOR.ROPE_FACTORS_LONG: "rope_factors_long",
    MODEL_TENSOR.ROPE_FACTORS_SHORT: "rope_factors_short",
    MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
    MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2",
    MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
    MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q",
    MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k",
    MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v",
    MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
    MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
    MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm",
    MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm",
    MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm",
    MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp",
    MODEL_TENSOR.FFN_GATE_INP_SHEXP: "blk.{bid}.ffn_gate_inp_shexp",
    MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
    MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
    MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
    MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
    MODEL_TENSOR.FFN_GATE_SHEXP: "blk.{bid}.ffn_gate_shexp",
    MODEL_TENSOR.FFN_DOWN_SHEXP: "blk.{bid}.ffn_down_shexp",
    MODEL_TENSOR.FFN_UP_SHEXP: "blk.{bid}.ffn_up_shexp",
    MODEL_TENSOR.FFN_ACT: "blk.{bid}.ffn",
    MODEL_TENSOR.FFN_NORM_EXP: "blk.{bid}.ffn_norm_exps",
    MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps",
    MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps",
    MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps",
    MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm",
    MODEL_TENSOR.SSM_IN: "blk.{bid}.ssm_in",
    MODEL_TENSOR.SSM_CONV1D: "blk.{bid}.ssm_conv1d",
    MODEL_TENSOR.SSM_X: "blk.{bid}.ssm_x",
    MODEL_TENSOR.SSM_DT: "blk.{bid}.ssm_dt",
    MODEL_TENSOR.SSM_A: "blk.{bid}.ssm_a",
    MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d",
    MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out",
    MODEL_TENSOR.ATTN_Q_A: "blk.{bid}.attn_q_a",
    MODEL_TENSOR.ATTN_Q_B: "blk.{bid}.attn_q_b",
    MODEL_TENSOR.ATTN_KV_A_MQA: "blk.{bid}.attn_kv_a_mqa",
    MODEL_TENSOR.ATTN_KV_B: "blk.{bid}.attn_kv_b",
    MODEL_TENSOR.ATTN_Q_A_NORM: "blk.{bid}.attn_q_a_norm",
    MODEL_TENSOR.ATTN_KV_A_NORM: "blk.{bid}.attn_kv_a_norm",
+    MODEL_TENSOR.ATTN_SUB_NORM: "blk.{bid}.attn_sub_norm",
+    MODEL_TENSOR.FFN_SUB_NORM: "blk.{bid}.ffn_sub_norm",
+    MODEL_TENSOR.DEC_ATTN_NORM: "dec.blk.{bid}.attn_norm",
+    MODEL_TENSOR.DEC_ATTN_Q: "dec.blk.{bid}.attn_q",
+    MODEL_TENSOR.DEC_ATTN_K: "dec.blk.{bid}.attn_k",
+    MODEL_TENSOR.DEC_ATTN_V: "dec.blk.{bid}.attn_v",
+    MODEL_TENSOR.DEC_ATTN_OUT: "dec.blk.{bid}.attn_o",
+    MODEL_TENSOR.DEC_ATTN_REL_B: "dec.blk.{bid}.attn_rel_b",
+    MODEL_TENSOR.DEC_CROSS_ATTN_NORM: "dec.blk.{bid}.cross_attn_norm",
+    MODEL_TENSOR.DEC_CROSS_ATTN_Q: "dec.blk.{bid}.cross_attn_q",
+    MODEL_TENSOR.DEC_CROSS_ATTN_K: "dec.blk.{bid}.cross_attn_k",
+    MODEL_TENSOR.DEC_CROSS_ATTN_V: "dec.blk.{bid}.cross_attn_v",
+    MODEL_TENSOR.DEC_CROSS_ATTN_OUT: "dec.blk.{bid}.cross_attn_o",
+    MODEL_TENSOR.DEC_CROSS_ATTN_REL_B: "dec.blk.{bid}.cross_attn_rel_b",
+    MODEL_TENSOR.DEC_FFN_NORM: "dec.blk.{bid}.ffn_norm",
+    MODEL_TENSOR.DEC_FFN_GATE: "dec.blk.{bid}.ffn_gate",
+    MODEL_TENSOR.DEC_FFN_DOWN: "dec.blk.{bid}.ffn_down",
+    MODEL_TENSOR.DEC_FFN_UP: "dec.blk.{bid}.ffn_up",
+    MODEL_TENSOR.DEC_OUTPUT_NORM: "dec.output_norm",
+    MODEL_TENSOR.ENC_ATTN_NORM: "enc.blk.{bid}.attn_norm",
+    MODEL_TENSOR.ENC_ATTN_Q: "enc.blk.{bid}.attn_q",
+    MODEL_TENSOR.ENC_ATTN_K: "enc.blk.{bid}.attn_k",
+    MODEL_TENSOR.ENC_ATTN_V: "enc.blk.{bid}.attn_v",
+    MODEL_TENSOR.ENC_ATTN_OUT: "enc.blk.{bid}.attn_o",
+    MODEL_TENSOR.ENC_ATTN_REL_B: "enc.blk.{bid}.attn_rel_b",
+    MODEL_TENSOR.ENC_FFN_NORM: "enc.blk.{bid}.ffn_norm",
+    MODEL_TENSOR.ENC_FFN_GATE: "enc.blk.{bid}.ffn_gate",
+    MODEL_TENSOR.ENC_FFN_DOWN: "enc.blk.{bid}.ffn_down",
+    MODEL_TENSOR.ENC_FFN_UP: "enc.blk.{bid}.ffn_up",
+    MODEL_TENSOR.ENC_OUTPUT_NORM: "enc.output_norm",
}

MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {

@@ -808,6 +881,53 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.FFN_DOWN_SHEXP,
        MODEL_TENSOR.FFN_UP_SHEXP,
    ],
+    MODEL_ARCH.BITNET: [
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.ATTN_SUB_NORM,
+        MODEL_TENSOR.FFN_SUB_NORM,
+    ],
+    MODEL_ARCH.T5: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.DEC_ATTN_NORM,
+        MODEL_TENSOR.DEC_ATTN_Q,
+        MODEL_TENSOR.DEC_ATTN_K,
+        MODEL_TENSOR.DEC_ATTN_V,
+        MODEL_TENSOR.DEC_ATTN_OUT,
+        MODEL_TENSOR.DEC_ATTN_REL_B,
+        MODEL_TENSOR.DEC_CROSS_ATTN_NORM,
+        MODEL_TENSOR.DEC_CROSS_ATTN_Q,
+        MODEL_TENSOR.DEC_CROSS_ATTN_K,
+        MODEL_TENSOR.DEC_CROSS_ATTN_V,
+        MODEL_TENSOR.DEC_CROSS_ATTN_OUT,
+        MODEL_TENSOR.DEC_CROSS_ATTN_REL_B,
+        MODEL_TENSOR.DEC_FFN_NORM,
+        MODEL_TENSOR.DEC_FFN_GATE,
+        MODEL_TENSOR.DEC_FFN_DOWN,
+        MODEL_TENSOR.DEC_FFN_UP,
+        MODEL_TENSOR.DEC_OUTPUT_NORM,
+        MODEL_TENSOR.ENC_ATTN_NORM,
+        MODEL_TENSOR.ENC_ATTN_Q,
+        MODEL_TENSOR.ENC_ATTN_K,
+        MODEL_TENSOR.ENC_ATTN_V,
+        MODEL_TENSOR.ENC_ATTN_OUT,
+        MODEL_TENSOR.ENC_ATTN_REL_B,
+        MODEL_TENSOR.ENC_FFN_NORM,
+        MODEL_TENSOR.ENC_FFN_GATE,
+        MODEL_TENSOR.ENC_FFN_DOWN,
+        MODEL_TENSOR.ENC_FFN_UP,
+        MODEL_TENSOR.ENC_OUTPUT_NORM,
+    ],
    # TODO
}

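The added MODEL_TENSOR entries map to plain format-string templates in TENSOR_NAMES. A minimal sketch of how one of the new T5 names expands for a given block id; the import path follows the gguf-py package layout, and the block id is arbitrary:

    # Illustrative sketch: expanding one of the new T5 tensor name templates.
    from gguf.constants import MODEL_TENSOR, TENSOR_NAMES

    name = TENSOR_NAMES[MODEL_TENSOR.DEC_CROSS_ATTN_Q].format(bid=3)
    print(name)  # dec.blk.3.cross_attn_q
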
@@ -7,6 +7,7 @@ import struct
import tempfile
from dataclasses import dataclass
from enum import Enum, auto
+from pathlib import Path
from io import BufferedWriter
from typing import IO, Any, Sequence, Mapping
from string import ascii_letters, digits

@@ -31,6 +32,9 @@ from .quants import quant_shape_from_byte_shape
logger = logging.getLogger(__name__)


+SHARD_NAME_FORMAT = "{:s}-{:05d}-of-{:05d}.gguf"
+
+
@dataclass
class TensorInfo:
    shape: Sequence[int]

@@ -55,11 +59,11 @@ class WriterState(Enum):


class GGUFWriter:
-    fout: BufferedWriter | None
-    path: os.PathLike[str] | str | None
+    fout: list[BufferedWriter] | None
+    path: Path | None
    temp_file: tempfile.SpooledTemporaryFile[bytes] | None
-    tensors: dict[str, TensorInfo]
-    kv_data: dict[str, GGUFValue]
+    tensors: list[dict[str, TensorInfo]]
+    kv_data: list[dict[str, GGUFValue]]
    state: WriterState
    _simple_value_packing = {
        GGUFValueType.UINT8: "B",

@@ -76,26 +80,38 @@ class GGUFWriter:
    }

    def __init__(
-        self, path: os.PathLike[str] | str | None, arch: str, use_temp_file: bool = False,
-        endianess: GGUFEndian = GGUFEndian.LITTLE,
+        self, path: os.PathLike[str] | str | None, arch: str, use_temp_file: bool = False, endianess: GGUFEndian = GGUFEndian.LITTLE,
+        split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False
    ):
        self.fout = None
-        self.path = path
+        self.path = Path(path) if path else None
        self.arch = arch
        self.endianess = endianess
        self.data_alignment = GGUF_DEFAULT_ALIGNMENT
        self.use_temp_file = use_temp_file
        self.temp_file = None
-        self.tensors = dict()
-        self.kv_data = dict()
+        self.tensors = [{}]
+        self.kv_data = [{}]
+        self.split_max_tensors = split_max_tensors
+        self.split_max_size = split_max_size
+        self.dry_run = dry_run
+        self.small_first_shard = small_first_shard
        logger.info("gguf: This GGUF file is for {0} Endian only".format(
            "Big" if self.endianess == GGUFEndian.BIG else "Little",
        ))
        self.state = WriterState.NO_FILE

+        if self.small_first_shard:
+            self.tensors.append({})
+
        self.add_architecture()

-    def open_output_file(self, path: os.PathLike[str] | str | None = None) -> None:
+    def format_shard_names(self, path: Path) -> list[Path]:
+        if len(self.tensors) == 1:
+            return [path]
+        return [path.with_name(SHARD_NAME_FORMAT.format(path.stem, i + 1, len(self.tensors))) for i in range(len(self.tensors))]
+
+    def open_output_file(self, path: Path | None = None) -> None:
        if self.state is WriterState.EMPTY and self.fout is not None and (path is None or path == self.path):
            # allow calling this multiple times as long as the path is the same
            return
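For reference, the shard naming introduced by SHARD_NAME_FORMAT and format_shard_names can be reproduced standalone; a small sketch (the model file name here is made up, only Path and str.format are used):

    # Sketch of the shard file names format_shard_names would produce for 3 shards.
    from pathlib import Path

    SHARD_NAME_FORMAT = "{:s}-{:05d}-of-{:05d}.gguf"

    path = Path("model-Q4_K_M.gguf")
    n_shards = 3
    names = [path.with_name(SHARD_NAME_FORMAT.format(path.stem, i + 1, n_shards)) for i in range(n_shards)]
    # model-Q4_K_M-00001-of-00003.gguf ... model-Q4_K_M-00003-of-00003.gguf
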
@@ -106,22 +122,58 @@ class GGUFWriter:
        self.path = path

        if self.path is not None:
-            if self.fout is not None:
-                self.fout.close()
-            self.fout = open(self.path, "wb")
+            filenames = self.print_plan()
+            self.fout = [open(filename, "wb") for filename in filenames]
            self.state = WriterState.EMPTY

-    def write_header_to_file(self, path: os.PathLike[str] | str | None = None) -> None:
+    def print_plan(self) -> list[Path]:
+        logger.info("Writing the following files:")
+        assert self.path is not None
+        filenames = self.format_shard_names(self.path)
+        assert len(filenames) == len(self.tensors)
+        for name, tensors in zip(filenames, self.tensors):
+            logger.info(f"{name}: n_tensors = {len(tensors)}, total_size = {GGUFWriter.format_n_bytes_to_str(sum(ti.nbytes for ti in tensors.values()))}")
+
+        if self.dry_run:
+            logger.info("Dry run, not writing files")
+            exit()
+
+        return filenames
+
+    def add_shard_kv_data(self) -> None:
+        if len(self.tensors) == 1:
+            return
+
+        total_tensors = sum(len(t) for t in self.tensors)
+        assert self.fout is not None
+        total_splits = len(self.fout)
+        self.kv_data.extend({} for _ in range(len(self.kv_data), total_splits))
+        for i, kv_data in enumerate(self.kv_data):
+            kv_data[Keys.Split.LLM_KV_SPLIT_NO] = GGUFValue(i, GGUFValueType.UINT16)
+            kv_data[Keys.Split.LLM_KV_SPLIT_COUNT] = GGUFValue(total_splits, GGUFValueType.UINT16)
+            kv_data[Keys.Split.LLM_KV_SPLIT_TENSORS_COUNT] = GGUFValue(total_tensors, GGUFValueType.INT32)
+
+    def write_header_to_file(self, path: Path | None = None) -> None:
+        if len(self.tensors) == 1 and (self.split_max_tensors != 0 or self.split_max_size != 0):
+            logger.warning("Model fails split requirements, not splitting")
+
        self.open_output_file(path)

        if self.state is not WriterState.EMPTY:
            raise ValueError(f'Expected output file to be empty, got {self.state}')

-        self._write_packed("<I", GGUF_MAGIC, skip_pack_prefix = True)
-        self._write_packed("I", GGUF_VERSION)
-        self._write_packed("Q", len(self.tensors))
-        self._write_packed("Q", len(self.kv_data))
-        self.flush()
+        assert self.fout is not None
+        assert len(self.fout) == len(self.tensors)
+        assert len(self.kv_data) == 1
+
+        self.add_shard_kv_data()
+
+        for fout, tensors, kv_data in zip(self.fout, self.tensors, self.kv_data):
+            fout.write(self._pack("<I", GGUF_MAGIC, skip_pack_prefix = True))
+            fout.write(self._pack("I", GGUF_VERSION))
+            fout.write(self._pack("Q", len(tensors)))
+            fout.write(self._pack("Q", len(kv_data)))
+            fout.flush()
        self.state = WriterState.HEADER

    def write_kv_data_to_file(self) -> None:

@@ -129,13 +181,15 @@ class GGUFWriter:
            raise ValueError(f'Expected output file to contain the header, got {self.state}')
        assert self.fout is not None

-        kv_data = bytearray()
-
-        for key, val in self.kv_data.items():
-            kv_data += self._pack_val(key, GGUFValueType.STRING, add_vtype=False)
-            kv_data += self._pack_val(val.value, val.type, add_vtype=True)
-
-        self.fout.write(kv_data)
+        for fout, kv_data in zip(self.fout, self.kv_data):
+            kv_bytes = bytearray()
+
+            for key, val in kv_data.items():
+                kv_bytes += self._pack_val(key, GGUFValueType.STRING, add_vtype=False)
+                kv_bytes += self._pack_val(val.value, val.type, add_vtype=True)
+
+            fout.write(kv_bytes)
+
        self.flush()
        self.state = WriterState.KV_DATA

@@ -144,28 +198,29 @@ class GGUFWriter:
            raise ValueError(f'Expected output file to contain KV data, got {self.state}')
        assert self.fout is not None

-        ti_data = bytearray()
-        offset_tensor = 0
-
-        for name, ti in self.tensors.items():
-            ti_data += self._pack_val(name, GGUFValueType.STRING, add_vtype=False)
-            n_dims = len(ti.shape)
-            ti_data += self._pack("I", n_dims)
-            for i in range(n_dims):
-                ti_data += self._pack("Q", ti.shape[n_dims - 1 - i])
-            ti_data += self._pack("I", ti.dtype)
-            ti_data += self._pack("Q", offset_tensor)
-            offset_tensor += GGUFWriter.ggml_pad(ti.nbytes, self.data_alignment)
-
-        self.fout.write(ti_data)
-        self.flush()
+        for fout, tensors in zip(self.fout, self.tensors):
+            ti_data = bytearray()
+            offset_tensor = 0
+
+            for name, ti in tensors.items():
+                ti_data += self._pack_val(name, GGUFValueType.STRING, add_vtype=False)
+                n_dims = len(ti.shape)
+                ti_data += self._pack("I", n_dims)
+                for j in range(n_dims):
+                    ti_data += self._pack("Q", ti.shape[n_dims - 1 - j])
+                ti_data += self._pack("I", ti.dtype)
+                ti_data += self._pack("Q", offset_tensor)
+                offset_tensor += GGUFWriter.ggml_pad(ti.nbytes, self.data_alignment)
+
+            fout.write(ti_data)
+            fout.flush()
        self.state = WriterState.TI_DATA

    def add_key_value(self, key: str, val: Any, vtype: GGUFValueType) -> None:
-        if key in self.kv_data:
+        if any(key in kv_data for kv_data in self.kv_data):
            raise ValueError(f'Duplicated key name {key!r}')

-        self.kv_data[key] = GGUFValue(value=val, type=vtype)
+        self.kv_data[0][key] = GGUFValue(value=val, type=vtype)

    def add_uint8(self, key: str, val: int) -> None:
        self.add_key_value(key, val, GGUFValueType.UINT8)

@@ -206,9 +261,6 @@ class GGUFWriter:
        self.add_key_value(key, val, GGUFValueType.STRING)

    def add_array(self, key: str, val: Sequence[Any]) -> None:
-        if not isinstance(val, Sequence):
-            raise ValueError("Value must be a sequence for array type")
-
        self.add_key_value(key, val, GGUFValueType.ARRAY)

    @staticmethod

@@ -222,7 +274,7 @@ class GGUFWriter:
        if self.state is not WriterState.NO_FILE:
            raise ValueError(f'Expected output file to be not yet opened, got {self.state}')

-        if name in self.tensors:
+        if any(name in tensors for tensors in self.tensors):
            raise ValueError(f'Duplicated tensor name {name!r}')

        if raw_dtype is None:

@@ -247,7 +299,18 @@ class GGUFWriter:
        if tensor_dtype == np.uint8:
            tensor_shape = quant_shape_from_byte_shape(tensor_shape, raw_dtype)

-        self.tensors[name] = TensorInfo(shape=tensor_shape, dtype=dtype, nbytes=tensor_nbytes)
+        # make sure there is at least one tensor before splitting
+        if len(self.tensors[-1]) > 0:
+            if ( # split when over tensor limit
+                self.split_max_tensors != 0
+                and len(self.tensors[-1]) >= self.split_max_tensors
+            ) or ( # split when over size limit
+                self.split_max_size != 0
+                and sum(ti.nbytes for ti in self.tensors[-1].values()) + tensor_nbytes > self.split_max_size
+            ):
+                self.tensors.append({})
+
+        self.tensors[-1][name] = TensorInfo(shape=tensor_shape, dtype=dtype, nbytes=tensor_nbytes)

    def add_tensor(
        self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Sequence[int] | None = None,
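The constructor options added above and this per-tensor check are what decide where a new shard starts: a shard is closed once it holds split_max_tensors tensors or would exceed split_max_size bytes. A rough usage sketch; the output path, tensor names, shapes and limits are made up, and the call sequence mirrors how the existing convert scripts drive GGUFWriter:

    # Rough sketch of writing a sharded GGUF with the new options (values are illustrative).
    import numpy as np
    from gguf import GGUFWriter

    writer = GGUFWriter("out.gguf", arch="llama", split_max_tensors=128, small_first_shard=True)
    for i in range(300):
        writer.add_tensor(f"blk.{i}.ffn_up.weight", np.zeros((4, 4), dtype=np.float32))

    writer.write_header_to_file()      # one header per shard, plus the split.* KV pairs
    writer.write_kv_data_to_file()
    writer.write_tensors_to_file(progress=True)
    writer.close()
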
@@ -264,7 +327,7 @@ class GGUFWriter:
        self.add_tensor_info(name, shape, tensor.dtype, tensor.nbytes, raw_dtype=raw_dtype)

        if self.temp_file is None:
-            self.tensors[name].tensor = tensor
+            self.tensors[-1][name].tensor = tensor
            return

        tensor.tofile(self.temp_file)

@@ -282,9 +345,24 @@ class GGUFWriter:

        if self.endianess == GGUFEndian.BIG:
            tensor.byteswap(inplace=True)
-        self.write_padding(self.fout, self.fout.tell())
-        tensor.tofile(self.fout)
-        self.write_padding(self.fout, tensor.nbytes)
+
+        file_id = -1
+        for i, tensors in enumerate(self.tensors):
+            if len(tensors) > 0:
+                file_id = i
+                break
+
+        fout = self.fout[file_id]
+
+        # pop the first tensor info
+        # TODO: cleaner way to get the first key
+        first_tensor_name = [name for name, _ in zip(self.tensors[file_id].keys(), range(1))][0]
+        ti = self.tensors[file_id].pop(first_tensor_name)
+        assert ti.nbytes == tensor.nbytes
+
+        self.write_padding(fout, fout.tell())
+        tensor.tofile(fout)
+        self.write_padding(fout, tensor.nbytes)

        self.state = WriterState.WEIGHTS

@@ -293,31 +371,43 @@ class GGUFWriter:

        assert self.fout is not None

-        self.write_padding(self.fout, self.fout.tell())
+        for fout in self.fout:
+            self.write_padding(fout, fout.tell())

        if self.temp_file is None:
+            shard_bar = None
            bar = None

            if progress:
                from tqdm import tqdm

-                total_bytes = sum(t.nbytes for t in self.tensors.values())
+                total_bytes = sum(ti.nbytes for t in self.tensors for ti in t.values())

+                if len(self.fout) > 1:
+                    shard_bar = tqdm(desc=f"Shard (0/{len(self.fout)})", total=None, unit="byte", unit_scale=True)
                bar = tqdm(desc="Writing", total=total_bytes, unit="byte", unit_scale=True)

-            # relying on the fact that Python dicts preserve insertion order (since 3.7)
-            for ti in self.tensors.values():
-                assert ti.tensor is not None # can only iterate once over the tensors
-                assert ti.tensor.nbytes == ti.nbytes
-                ti.tensor.tofile(self.fout)
-                if bar is not None:
-                    bar.update(ti.nbytes)
-                self.write_padding(self.fout, ti.nbytes)
-                ti.tensor = None
+            for i, (fout, tensors) in enumerate(zip(self.fout, self.tensors)):
+                if shard_bar is not None:
+                    shard_bar.set_description(f"Shard ({i + 1}/{len(self.fout)})")
+                    total = sum(ti.nbytes for ti in tensors.values())
+                    shard_bar.reset(total=(total if total > 0 else None))
+
+                # relying on the fact that Python dicts preserve insertion order (since 3.7)
+                for ti in tensors.values():
+                    assert ti.tensor is not None # can only iterate once over the tensors
+                    assert ti.tensor.nbytes == ti.nbytes
+                    ti.tensor.tofile(fout)
+                    if shard_bar is not None:
+                        shard_bar.update(ti.nbytes)
+                    if bar is not None:
+                        bar.update(ti.nbytes)
+                    self.write_padding(fout, ti.nbytes)
+                    ti.tensor = None
        else:
            self.temp_file.seek(0)

-            shutil.copyfileobj(self.temp_file, self.fout)
+            shutil.copyfileobj(self.temp_file, self.fout[0 if not self.small_first_shard else 1])
            self.flush()
            self.temp_file.close()

@@ -325,11 +415,13 @@ class GGUFWriter:

    def flush(self) -> None:
        assert self.fout is not None
-        self.fout.flush()
+        for fout in self.fout:
+            fout.flush()

    def close(self) -> None:
        if self.fout is not None:
-            self.fout.close()
+            for fout in self.fout:
+                fout.close()
            self.fout = None

    def add_architecture(self) -> None:

@@ -400,6 +492,9 @@ class GGUFWriter:
    def add_parallel_residual(self, use: bool) -> None:
        self.add_bool(Keys.LLM.USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)

+    def add_decoder_start_token_id(self, id: int) -> None:
+        self.add_uint32(Keys.LLM.DECODER_START_TOKEN_ID.format(arch=self.arch), id)
+
    def add_head_count(self, count: int) -> None:
        self.add_uint32(Keys.Attention.HEAD_COUNT.format(arch=self.arch), count)

@@ -448,6 +543,9 @@ class GGUFWriter:
    def add_kv_lora_rank(self, length: int) -> None:
        self.add_uint32(Keys.Attention.KV_LORA_RANK.format(arch=self.arch), length)

+    def add_relative_attn_buckets_count(self, value: int) -> None:
+        self.add_uint32(Keys.Attention.REL_BUCKETS_COUNT.format(arch=self.arch), value)
+
    def add_pooling_type(self, value: PoolingType) -> None:
        self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value.value)

@@ -538,6 +636,12 @@ class GGUFWriter:
    def add_add_space_prefix(self, value: bool) -> None:
        self.add_bool(Keys.Tokenizer.ADD_PREFIX, value)

+    def add_remove_extra_whitespaces(self, value: bool) -> None:
+        self.add_bool(Keys.Tokenizer.REMOVE_EXTRA_WS, value)
+
+    def add_precompiled_charsmap(self, charsmap: Sequence[bytes]) -> None:
+        self.add_array(Keys.Tokenizer.PRECOMPILED_CHARSMAP, charsmap)
+
    def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None:
        if not isinstance(value, str):
            template_default = None

@@ -599,9 +703,12 @@ class GGUFWriter:
            kv_data += self._pack("Q", len(encoded_val))
            kv_data += encoded_val
        elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and val:
-            ltype = GGUFValueType.get_type(val[0])
-            if not all(GGUFValueType.get_type(i) is ltype for i in val[1:]):
-                raise ValueError("All items in a GGUF array should be of the same type")
+            if isinstance(val, bytes):
+                ltype = GGUFValueType.UINT8
+            else:
+                ltype = GGUFValueType.get_type(val[0])
+                if not all(GGUFValueType.get_type(i) is ltype for i in val[1:]):
+                    raise ValueError("All items in a GGUF array should be of the same type")
            kv_data += self._pack("I", ltype)
            kv_data += self._pack("Q", len(val))
            for item in val:

@@ -611,6 +718,13 @@ class GGUFWriter:

        return kv_data

-    def _write_packed(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> None:
-        assert self.fout is not None
-        self.fout.write(self._pack(fmt, value, skip_pack_prefix))
+    @staticmethod
+    def format_n_bytes_to_str(num: int) -> str:
+        if num == 0:
+            return "negligible - metadata only"
+        fnum = float(num)
+        for unit in ("", "K", "M", "G"):
+            if abs(fnum) < 1000.0:
+                return f"{fnum:3.1f}{unit}"
+            fnum /= 1000.0
+        return f"{fnum:.1f}T - over 1TB, split recommended"
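A quick check of what format_n_bytes_to_str returns for a few arbitrary sizes (decimal units, since the code above divides by 1000):

    from gguf import GGUFWriter

    GGUFWriter.format_n_bytes_to_str(0)            # 'negligible - metadata only'
    GGUFWriter.format_n_bytes_to_str(123_456_789)  # '123.5M'
    GGUFWriter.format_n_bytes_to_str(2 * 10**12)   # '2.0T - over 1TB, split recommended'
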
@@ -24,6 +24,7 @@ class TensorNameMap:
            "backbone.embedding",       # mamba
            "backbone.embeddings",      # mamba-hf
            "transformer.in_out_embed", # Grok
+            "shared",                   # t5
        ),

        # Token type embeddings

@@ -413,6 +414,128 @@ class TensorNameMap:
        MODEL_TENSOR.ATTN_KV_A_NORM: (
            "model.layers.{bid}.self_attn.kv_a_layernorm", # deepseek2
        ),
+
+        MODEL_TENSOR.ATTN_SUB_NORM: (
+            "model.layers.{bid}.self_attn.inner_attn_ln", # bitnet
+        ),
+
+        MODEL_TENSOR.FFN_SUB_NORM: (
+            "model.layers.{bid}.mlp.ffn_layernorm", # bitnet
+        ),
+
+        MODEL_TENSOR.DEC_ATTN_NORM: (
+            "decoder.block.{bid}.layer.0.layer_norm", # t5
+        ),
+
+        MODEL_TENSOR.DEC_ATTN_Q: (
+            "decoder.block.{bid}.layer.0.SelfAttention.q", # t5
+        ),
+
+        MODEL_TENSOR.DEC_ATTN_K: (
+            "decoder.block.{bid}.layer.0.SelfAttention.k", # t5
+        ),
+
+        MODEL_TENSOR.DEC_ATTN_V: (
+            "decoder.block.{bid}.layer.0.SelfAttention.v", # t5
+        ),
+
+        MODEL_TENSOR.DEC_ATTN_OUT: (
+            "decoder.block.{bid}.layer.0.SelfAttention.o", # t5
+        ),
+
+        MODEL_TENSOR.DEC_ATTN_REL_B: (
+            "decoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias", # t5
+        ),
+
+        MODEL_TENSOR.DEC_CROSS_ATTN_NORM: (
+            "decoder.block.{bid}.layer.1.layer_norm", # t5
+        ),
+
+        MODEL_TENSOR.DEC_CROSS_ATTN_Q: (
+            "decoder.block.{bid}.layer.1.EncDecAttention.q", # t5
+        ),
+
+        MODEL_TENSOR.DEC_CROSS_ATTN_K: (
+            "decoder.block.{bid}.layer.1.EncDecAttention.k", # t5
+        ),
+
+        MODEL_TENSOR.DEC_CROSS_ATTN_V: (
+            "decoder.block.{bid}.layer.1.EncDecAttention.v", # t5
+        ),
+
+        MODEL_TENSOR.DEC_CROSS_ATTN_OUT: (
+            "decoder.block.{bid}.layer.1.EncDecAttention.o", # t5
+        ),
+
+        MODEL_TENSOR.DEC_CROSS_ATTN_REL_B: (
+            "decoder.block.{bid}.layer.1.EncDecAttention.relative_attention_bias", # t5
+        ),
+
+        MODEL_TENSOR.DEC_FFN_NORM: (
+            "decoder.block.{bid}.layer.2.layer_norm", # t5
+        ),
+
+        MODEL_TENSOR.DEC_FFN_GATE: (
+            "decoder.block.{bid}.layer.2.DenseReluDense.wi_0", # flan-t5
+        ),
+
+        MODEL_TENSOR.DEC_FFN_UP: (
+            "decoder.block.{bid}.layer.2.DenseReluDense.wi",   # t5
+            "decoder.block.{bid}.layer.2.DenseReluDense.wi_1", # flan-t5
+        ),
+
+        MODEL_TENSOR.DEC_FFN_DOWN: (
+            "decoder.block.{bid}.layer.2.DenseReluDense.wo", # t5
+        ),
+
+        MODEL_TENSOR.DEC_OUTPUT_NORM: (
+            "decoder.final_layer_norm", # t5
+        ),
+
+        MODEL_TENSOR.ENC_ATTN_NORM: (
+            "encoder.block.{bid}.layer.0.layer_norm", # t5
+        ),
+
+        MODEL_TENSOR.ENC_ATTN_Q: (
+            "encoder.block.{bid}.layer.0.SelfAttention.q", # t5
+        ),
+
+        MODEL_TENSOR.ENC_ATTN_K: (
+            "encoder.block.{bid}.layer.0.SelfAttention.k", # t5
+        ),
+
+        MODEL_TENSOR.ENC_ATTN_V: (
+            "encoder.block.{bid}.layer.0.SelfAttention.v", # t5
+        ),
+
+        MODEL_TENSOR.ENC_ATTN_OUT: (
+            "encoder.block.{bid}.layer.0.SelfAttention.o", # t5
+        ),
+
+        MODEL_TENSOR.ENC_ATTN_REL_B: (
+            "encoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias", # t5
+        ),
+
+        MODEL_TENSOR.ENC_FFN_NORM: (
+            "encoder.block.{bid}.layer.1.layer_norm", # t5
+        ),
+
+        MODEL_TENSOR.ENC_FFN_GATE: (
+            "encoder.block.{bid}.layer.1.DenseReluDense.wi_0", # flan-t5
+        ),
+
+        MODEL_TENSOR.ENC_FFN_UP: (
+            "encoder.block.{bid}.layer.1.DenseReluDense.wi",   # t5
+            "encoder.block.{bid}.layer.1.DenseReluDense.wi_1", # flan-t5
+        ),
+
+        MODEL_TENSOR.ENC_FFN_DOWN: (
+            "encoder.block.{bid}.layer.1.DenseReluDense.wo", # t5
+        ),
+
+        MODEL_TENSOR.ENC_OUTPUT_NORM: (
+            "encoder.final_layer_norm", # t5
+        ),
    }

    # architecture-specific block mappings
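With these entries in place, HF-style T5 tensor names resolve to the new enc./dec. GGUF names through the usual name map. A hedged sketch; get_tensor_name_map and get_name are the existing gguf-py helpers, and the block count and tensor name below are arbitrary:

    # Sketch: resolving a T5 checkpoint tensor name via the new mappings.
    import gguf

    tmap = gguf.get_tensor_name_map(gguf.MODEL_ARCH.T5, 12)
    tmap.get_name("decoder.block.3.layer.1.EncDecAttention.q.weight", try_suffixes=(".weight", ".bias"))
    # -> 'dec.blk.3.cross_attn_q.weight'
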
@@ -208,7 +208,9 @@ def translate_tensor_name(name):
        'ssm_d': 'State space model skip connection',
        'ssm_dt': 'State space model time step',
        'ssm_out': 'State space model output projection',
-        'blk': 'Block'
+        'blk': 'Block',
+        'enc': 'Encoder',
+        'dec': 'Decoder',
    }

    expanded_words = []

@@ -291,6 +293,10 @@ def dump_markdown_metadata(reader: GGUFReader, args: argparse.Namespace) -> None:
        tensor_group_name = "base"
        if tensor_components[0] == 'blk':
            tensor_group_name = f"{tensor_components[0]}.{tensor_components[1]}"
+        elif tensor_components[0] in ['enc', 'dec'] and tensor_components[1] == 'blk':
+            tensor_group_name = f"{tensor_components[0]}.{tensor_components[1]}.{tensor_components[2]}"
+        elif tensor_components[0] in ['enc', 'dec']:
+            tensor_group_name = f"{tensor_components[0]}"

        # Check if new Tensor Group
        if tensor_group_name not in tensor_groups:
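The extended grouping rule can be restated on its own; the helper name below is ours, not the script's, but the branching matches the code above:

    # Standalone restatement of the extended tensor-group naming.
    def tensor_group(name: str) -> str:
        parts = name.split(".")
        if parts[0] == "blk":
            return f"{parts[0]}.{parts[1]}"
        if parts[0] in ("enc", "dec") and parts[1] == "blk":
            return f"{parts[0]}.{parts[1]}.{parts[2]}"
        if parts[0] in ("enc", "dec"):
            return parts[0]
        return "base"

    tensor_group("dec.blk.7.cross_attn_k.weight")  # 'dec.blk.7'
    tensor_group("enc.output_norm.weight")         # 'enc'
    tensor_group("token_embd.weight")              # 'base'
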
llama.cpp
@@ -225,6 +225,7 @@ enum llm_arch {
    LLM_ARCH_OLMO,
    LLM_ARCH_ARCTIC,
    LLM_ARCH_DEEPSEEK2,
+    LLM_ARCH_BITNET,
    LLM_ARCH_UNKNOWN,
};

@@ -263,6 +264,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_OLMO, "olmo" },
    { LLM_ARCH_ARCTIC, "arctic" },
    { LLM_ARCH_DEEPSEEK2, "deepseek2" },
+    { LLM_ARCH_BITNET, "bitnet" },
    { LLM_ARCH_UNKNOWN, "(unknown)" },
};

@@ -500,6 +502,8 @@ enum llm_tensor {
    LLM_TENSOR_ATTN_KV_B,
    LLM_TENSOR_ATTN_Q_A_NORM,
    LLM_TENSOR_ATTN_KV_A_NORM,
+    LLM_TENSOR_ATTN_SUB_NORM,
+    LLM_TENSOR_FFN_SUB_NORM,
};

static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {

@@ -1113,6 +1117,24 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
            { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
        },
    },
+    {
+        LLM_ARCH_BITNET,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_SUB_NORM, "blk.%d.attn_sub_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_SUB_NORM, "blk.%d.ffn_sub_norm" },
+        },
+    },
    {
        LLM_ARCH_UNKNOWN,
        {

@@ -2118,6 +2140,8 @@ struct llama_layer {
    struct ggml_tensor * attn_out_norm_b;
    struct ggml_tensor * attn_q_a_norm;
    struct ggml_tensor * attn_kv_a_norm;
+    struct ggml_tensor * attn_sub_norm;
+    struct ggml_tensor * ffn_sub_norm;

    // attention
    struct ggml_tensor * wq;

@@ -2185,6 +2209,15 @@ struct llama_layer {
    // long rope factors
    struct ggml_tensor * rope_long = nullptr;
    struct ggml_tensor * rope_short = nullptr;
+
+    // bitnet scale
+    struct ggml_tensor * wq_scale;
+    struct ggml_tensor * wk_scale;
+    struct ggml_tensor * wv_scale;
+    struct ggml_tensor * wo_scale;
+    struct ggml_tensor * ffn_gate_scale;
+    struct ggml_tensor * ffn_up_scale;
+    struct ggml_tensor * ffn_down_scale;
};

struct llama_kv_cell {

@@ -2293,6 +2326,8 @@ struct llama_vocab {
    enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
    enum llama_vocab_pre_type type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;

+    int max_token_len = 0; // used for optimizing longest token search
+
    std::unordered_map<token, id> token_to_id;
    std::vector<token_data> id_to_token;

@@ -4708,6 +4743,15 @@ static void llm_load_hparams(
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
+        case LLM_ARCH_BITNET:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 26: model.type = e_model::MODEL_3B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
        default: (void)0;
    }

@@ -4939,6 +4983,7 @@ static void llm_load_vocab(
        GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);

        vocab.token_to_id[word] = i;
+        vocab.max_token_len = std::max(vocab.max_token_len, (int) word.size());

        auto & token_data = vocab.id_to_token[i];
        token_data.text = std::move(word);

@@ -5249,6 +5294,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
    if (vocab.special_middle_id != -1) { LLAMA_LOG_INFO( "%s: MID token = %d '%s'\n", __func__, vocab.special_middle_id, vocab.id_to_token[vocab.special_middle_id].text.c_str() ); }
    if (vocab.special_eot_id != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); }

+    LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, vocab.max_token_len);
+
    if (model.arch == LLM_ARCH_DEEPSEEK2) {
        LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
        LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);

@@ -6650,6 +6697,44 @@ static bool llm_load_tensors(
                    }
                }
            } break;
+        case LLM_ARCH_BITNET:
+            {
+                model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                // output
+                {
+                    model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                }
+
+                for (int i = 0; i < n_layer; ++i) {
+                    ggml_context * ctx_layer = ctx_for_layer(i);
+                    ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                    auto & layer = model.layers[i];
+
+                    layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                    layer.attn_sub_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_SUB_NORM, "weight", i), {n_embd});
+
+                    layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+                    layer.wq_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "scale", i), {1});
+                    layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+                    layer.wk_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "scale", i), {1});
+                    layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+                    layer.wv_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "scale", i), {1});
+                    layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+                    layer.wo_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1});
+
+                    layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                    layer.ffn_sub_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_SUB_NORM, "weight", i), {n_ff});
+
+                    layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+                    layer.ffn_gate_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE, "scale", i), {1});
+                    layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+                    layer.ffn_down_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1});
+                    layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+                    layer.ffn_up_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "scale", i), {1});
+                }
+            } break;
        default:
            throw std::runtime_error("unknown architecture");
    }
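Each BitNet projection loaded here pairs a quantized weight matrix with a single-element scale tensor; in build_bitnet further down, the Q and K matmuls are immediately followed by a ggml_mul with the corresponding scale. A numpy sketch of that same pattern, with made-up sizes and values:

    # y = (x @ W) * w_scale, the matmul-then-scale pattern used for BitNet projections.
    import numpy as np

    n_embd = 8
    x = np.random.rand(n_embd).astype(np.float32)                     # activations
    wq = np.sign(np.random.randn(n_embd, n_embd)).astype(np.float32)  # ternary-like weights
    wq_scale = np.float32(0.02)                                       # per-tensor scale, shape {1}

    q = (x @ wq) * wq_scale
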
@ -7290,7 +7375,10 @@ static struct ggml_tensor * llm_build_kqv(
|
||||||
|
|
||||||
ggml_build_forward_expand(graph, cur);
|
ggml_build_forward_expand(graph, cur);
|
||||||
|
|
||||||
cur = ggml_mul_mat(ctx, wo, cur);
|
if (wo) {
|
||||||
|
cur = ggml_mul_mat(ctx, wo, cur);
|
||||||
|
}
|
||||||
|
|
||||||
if (wo_b) {
|
if (wo_b) {
|
||||||
cb(cur, "kqv_wo", il);
|
cb(cur, "kqv_wo", il);
|
||||||
}
|
}
|
||||||
|
@@ -7649,6 +7737,50 @@ struct llm_build_context {
        return lctx.inp_s_seq;
    }

+    struct ggml_cgraph * append_pooling(struct ggml_cgraph * gf) {
+        // find result_norm tensor for input
+        struct ggml_tensor * inp = nullptr;
+        for (int i = gf->n_nodes - 1; i >= 0; --i) {
+            inp = gf->nodes[i];
+            if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) {
+                break;
+            } else {
+                inp = nullptr;
+            }
+        }
+        GGML_ASSERT(inp != nullptr && "missing result_norm/result_embd tensor");
+
+        struct ggml_tensor * cur;
+
+        switch (pooling_type) {
+            case LLAMA_POOLING_TYPE_MEAN:
+                {
+                    struct ggml_tensor * inp_mean = build_inp_mean();
+                    cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, inp)), inp_mean);
+                } break;
+            case LLAMA_POOLING_TYPE_CLS:
+            case LLAMA_POOLING_TYPE_LAST:
+                {
+                    struct ggml_tensor * inp_cls = build_inp_cls();
+                    cur = ggml_get_rows(ctx0, inp, inp_cls);
+                } break;
+            case LLAMA_POOLING_TYPE_NONE:
+                {
+                    cur = inp;
+                } break;
+            default:
+                {
+                    GGML_ASSERT(false && "unknown pooling type");
+                } break;
+        }
+
+        cb(cur, "result_embd_pooled", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
    struct ggml_cgraph * build_llama() {
        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
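For mean pooling, `inp_mean` acts as a weight matrix whose column for each sequence holds 1/n for that sequence's tokens, so the matrix product averages token embeddings per sequence; CLS/LAST pooling instead gathers one row per sequence through `inp_cls`. A small standalone restatement of the same math (plain C++, not ggml, purely illustrative):

    #include <vector>

    // embd is [n_tokens][n_embd], seq maps token index -> sequence id.
    static std::vector<std::vector<float>> mean_pool(const std::vector<std::vector<float>> & embd,
                                                     const std::vector<int> & seq, int n_seqs) {
        const int n_embd = embd.empty() ? 0 : (int) embd[0].size();
        std::vector<std::vector<float>> out(n_seqs, std::vector<float>(n_embd, 0.0f));
        std::vector<int> count(n_seqs, 0);
        for (size_t t = 0; t < embd.size(); ++t) {
            for (int e = 0; e < n_embd; ++e) {
                out[seq[t]][e] += embd[t][e];
            }
            count[seq[t]]++;
        }
        for (int s = 0; s < n_seqs; ++s) {
            for (int e = 0; e < n_embd; ++e) {
                if (count[s] > 0) out[s][e] /= count[s]; // average over the sequence's tokens
            }
        }
        return out;
    }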
@@ -8629,8 +8761,6 @@ struct llm_build_context {
        if (model.arch != LLM_ARCH_JINA_BERT_V2) {
            inp_pos = build_inp_pos();
        }
-        struct ggml_tensor * inp_mean = build_inp_mean();
-        struct ggml_tensor * inp_cls = build_inp_cls();

        // construct input embeddings (token, type, position)
        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
@@ -8805,28 +8935,6 @@ struct llm_build_context {
        cur = inpL;
        cb(cur, "result_embd", -1);

-        // pooling layer
-        switch (pooling_type) {
-            case LLAMA_POOLING_TYPE_NONE:
-                {
-                    // nop
-                } break;
-            case LLAMA_POOLING_TYPE_MEAN:
-                {
-                    cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
-                    cb(cur, "result_embd_pooled", -1);
-                } break;
-            case LLAMA_POOLING_TYPE_CLS:
-                {
-                    cur = ggml_get_rows(ctx0, cur, inp_cls);
-                    cb(cur, "result_embd_pooled", -1);
-                } break;
-            case LLAMA_POOLING_TYPE_UNSPECIFIED:
-                {
-                    GGML_ASSERT(false && "Invalid pooling type");
-                } break;
-        }
-
        ggml_build_forward_expand(gf, cur);

        return gf;
@@ -11684,6 +11792,153 @@ struct llm_build_context {
        return gf;
    }

+    struct ggml_cgraph * build_bitnet() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+                Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_scale);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                // B1.K
+                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+                Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_scale);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                // B1.V
+                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_scale);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
+                        nullptr, nullptr,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+
+                cur = llm_build_norm(ctx0, cur, hparams,
+                        model.layers[il].attn_sub_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(cur, "attn_sub_norm", il);
+
+                cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
+                cur = ggml_mul(ctx0, cur, model.layers[il].wo_scale);
+                if (model.layers[il].bo) {
+                    cur = ggml_add(ctx0, cur, model.layers[il].bo);
+                }
+                cb(cur, "attn_o_out", il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward forward
+            if (model.layers[il].ffn_gate_inp == nullptr) {
+                cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(cur, "ffn_norm", il);
+
+                struct ggml_tensor * tmp = ggml_mul_mat(ctx0, model.layers[il].ffn_up, cur);
+                tmp = ggml_mul(ctx0, tmp, model.layers[il].ffn_up_scale);
+                cb(tmp, "ffn_up", il);
+
+                cur = ggml_mul_mat(ctx0, model.layers[il].ffn_gate, cur);
+                cur = ggml_mul(ctx0, cur, model.layers[il].ffn_gate_scale);
+                cb(cur, "ffn_gate", il);
+
+                cur = ggml_silu(ctx0, cur);
+                cb(cur, "ffn_silu", il);
+
+                cur = ggml_mul(ctx0, cur, tmp);
+                cb(cur, "ffn_gate_par", il);
+
+                cur = llm_build_norm(ctx0, cur, hparams,
+                        model.layers[il].ffn_sub_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(cur, "ffn_sub_norm", il);
+
+                cur = ggml_mul_mat(ctx0, model.layers[il].ffn_down, cur);
+                cur = ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale);
+                cb(cur, "ffn_down", il);
+            }
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = ggml_mul_mat(ctx0, model.tok_embd, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+        return gf;
+    }
+
};

static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
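The BitNet block follows the usual llama-style layout, but every linear projection is multiplied by its companion scale tensor and an extra RMS sub-norm sits before the attention output projection and the FFN down projection. A rough sketch of the FFN path under those assumptions (standalone C++, dense weights instead of the quantized tensors used above, no learned norm weight):

    #include <cmath>
    #include <vector>

    static void rms_norm(std::vector<float> & v, float eps = 1e-5f) {
        float ss = 0.0f;
        for (float x : v) ss += x * x;
        const float inv = 1.0f / std::sqrt(ss / v.size() + eps);
        for (float & x : v) x *= inv;
    }

    // Illustrative BitNet-style FFN: up/gate/down each followed by a per-tensor scale,
    // with the sub-norm applied just before the down projection.
    static std::vector<float> bitnet_ffn(const std::vector<float> & x,
                                         const std::vector<std::vector<float>> & up,   float up_scale,
                                         const std::vector<std::vector<float>> & gate, float gate_scale,
                                         const std::vector<std::vector<float>> & down, float down_scale) {
        auto matvec = [](const std::vector<std::vector<float>> & w, const std::vector<float> & v) {
            std::vector<float> y(w.size(), 0.0f);
            for (size_t o = 0; o < w.size(); ++o)
                for (size_t i = 0; i < v.size(); ++i)
                    y[o] += w[o][i] * v[i];
            return y;
        };
        std::vector<float> u = matvec(up, x);
        std::vector<float> g = matvec(gate, x);
        for (auto & v : u) v *= up_scale;
        for (auto & v : g) v *= gate_scale;
        for (size_t i = 0; i < g.size(); ++i) {
            const float silu = g[i] / (1.0f + std::exp(-g[i]));
            u[i] *= silu; // gated activation (ffn_gate_par)
        }
        rms_norm(u); // ffn_sub_norm, simplified
        std::vector<float> y = matvec(down, u);
        for (auto & v : y) v *= down_scale;
        return y;
    }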
@@ -11907,10 +12162,19 @@ static struct ggml_cgraph * llama_build_graph(
            {
                result = llm.build_deepseek2();
            } break;
+        case LLM_ARCH_BITNET:
+            {
+                result = llm.build_bitnet();
+            } break;
        default:
            GGML_ASSERT(false);
    }

+    // add on pooling layer
+    if (lctx.cparams.embeddings) {
+        result = llm.append_pooling(result);
+    }
+
    llm.free();

    return result;
@@ -12000,7 +12264,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
        // (!a || b) is a logical implication (a -> b)
        // !hparams.causal_attn -> !cparams.causal_attn
        (hparams.causal_attn || !cparams.causal_attn) &&
-        "causal attention with embedding models is not supported"
+        "causal attention is not supported by this model"
    );

    if (lctx.inp_KQ_mask) {
@@ -12132,6 +12396,37 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
        }
    }

+    if (cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) {
+        const int64_t n_tokens = batch.n_tokens;
+
+        GGML_ASSERT(lctx.inp_cls);
+        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
+
+        uint32_t * data = (uint32_t *) lctx.inp_cls->data;
+        memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls));
+
+        std::vector<int> last_pos(n_tokens, -1);
+        std::vector<int> last_row(n_tokens, -1);
+
+        for (int i = 0; i < n_tokens; ++i) {
+            const llama_seq_id seq_id = batch.seq_id[i][0];
+            const llama_pos    pos    = batch.pos[i];
+
+            GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == LAST");
+
+            if (pos >= last_pos[seq_id]) {
+                last_pos[seq_id] = pos;
+                last_row[seq_id] = i;
+            }
+        }
+
+        for (int i = 0; i < n_tokens; ++i) {
+            if (last_row[i] >= 0) {
+                data[i] = last_row[i];
+            }
+        }
+    }
+
    if (kv_self.recurrent) {
        const int64_t n_kv = kv_self.n;
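The loop above records, for each sequence, the batch row of its highest-position token so that `inp_cls` can later gather the last token's embedding per sequence. The same selection in a compact standalone form (illustrative types, not the llama.cpp internals):

    #include <cstdint>
    #include <vector>

    // Given per-token (seq_id, pos), return for each sequence the row of its last token, or -1.
    static std::vector<int> last_token_rows(const std::vector<int32_t> & seq_id,
                                            const std::vector<int32_t> & pos, int n_seqs) {
        std::vector<int> last_pos(n_seqs, -1);
        std::vector<int> last_row(n_seqs, -1);
        for (size_t i = 0; i < seq_id.size(); ++i) {
            if (pos[i] >= last_pos[seq_id[i]]) {
                last_pos[seq_id[i]] = pos[i];
                last_row[seq_id[i]] = (int) i;
            }
        }
        return last_row;
    }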
@@ -12193,8 +12488,8 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
    const auto n_embd  = hparams.n_embd;

    // TODO: use a per-batch flag for logits presence instead
-    const bool has_logits = cparams.causal_attn;
-    const bool has_embd   = cparams.embeddings && (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
+    const bool has_logits = !cparams.embeddings;
+    const bool has_embd   =  cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);

    const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
    const size_t embd_size   = has_embd   ?  n_embd*n_outputs_max : 0;
@@ -12324,11 +12619,13 @@ static int llama_decode_internal(
    std::vector<std::vector<llama_seq_id>> seq_id;

    // count outputs
-    if (batch_all.logits) {
+    if (cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE) {
+        n_outputs = n_tokens_all;
+    } else if (batch_all.logits) {
        for (uint32_t i = 0; i < n_tokens_all; ++i) {
            n_outputs += batch_all.logits[i] != 0;
        }
-    } else if (lctx.logits_all || (cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE)) {
+    } else if (lctx.logits_all) {
        n_outputs = n_tokens_all;
    } else {
        // keep last output only
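With embeddings enabled and a pooling type other than NONE, every token in the batch counts as an output (the pooled result needs all rows); otherwise the per-token logits flags or logits_all decide, falling back to the last token only. A hedged restatement of that rule as a small helper:

    #include <cstdint>

    // Illustrative only: mirrors the output-count logic above.
    static uint32_t count_outputs(bool embeddings, bool pooled, bool logits_all,
                                  const int8_t * logits, uint32_t n_tokens) {
        if (embeddings && pooled) return n_tokens;
        if (logits) {
            uint32_t n = 0;
            for (uint32_t i = 0; i < n_tokens; ++i) n += logits[i] != 0;
            return n;
        }
        if (logits_all) return n_tokens;
        return 1; // keep last output only
    }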
@@ -12459,30 +12756,13 @@ static int llama_decode_internal(
            // no output
            res  = nullptr;
            embd = nullptr;
-        } else if (!hparams.causal_attn) {
-            res = nullptr; // do not extract logits for embedding models such as BERT
-
-            // token or sequence embeddings
-            embd = gf->nodes[gf->n_nodes - 1];
-
-            GGML_ASSERT(strcmp(embd->name, "result_embd") == 0 || strcmp(embd->name, "result_embd_pooled") == 0);
        } else if (cparams.embeddings) {
-            // the embeddings could be in the second to last tensor, or any of the previous tensors
-            int i_embd = gf->n_nodes - 2;
-            for (int i = 3; strcmp(embd->name, "result_norm") != 0; ++i) {
-                i_embd = gf->n_nodes - i;
-                if (i_embd < 0) { break; }
-                embd = gf->nodes[i_embd];
-            }
-            GGML_ASSERT(i_embd >= 0 && "missing result_norm tensor");
-
-            // TODO: use a per-batch flag to know when to skip logits while keeping embeddings
-            if (!cparams.causal_attn) {
-                res = nullptr; // do not extract logits when not needed
-                // skip computing logits
-                // TODO: is this safe?
-                gf->n_nodes = i_embd + 1;
+            res = nullptr; // do not extract logits for embedding case
+            embd = gf->nodes[gf->n_nodes - 1];
+            if (strcmp(embd->name, "result_embd_pooled") != 0) {
+                embd = gf->nodes[gf->n_nodes - 2];
            }
+            GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
        } else {
            embd = nullptr; // do not extract embeddings when not needed
            GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
@@ -12505,12 +12785,6 @@ static int llama_decode_internal(
        }
    }

-#ifdef GGML_PERF
-    // print timing information per ggml operation (for debugging purposes)
-    // requires GGML_PERF to be defined
-    ggml_graph_print(gf);
-#endif
-
    // plot the computation graph in dot format (for debugging purposes)
    //if (n_past%100 == 0) {
    //    ggml_graph_dump_dot(gf, NULL, "llama.dot");
@@ -12551,11 +12825,10 @@ static int llama_decode_internal(
                        ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_outputs_new*n_embd*sizeof(float));
                    }
                } break;
-            case LLAMA_POOLING_TYPE_CLS:
            case LLAMA_POOLING_TYPE_MEAN:
+            case LLAMA_POOLING_TYPE_CLS:
+            case LLAMA_POOLING_TYPE_LAST:
                {
-                    GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0);
-
                    // extract sequence embeddings
                    auto & embd_seq_out = lctx.embd_seq;
                    embd_seq_out.clear();
@@ -13448,7 +13721,7 @@ private:
struct llm_tokenizer_wpm {
    llm_tokenizer_wpm(const llama_vocab & vocab): vocab(vocab) {}

-    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
+    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) const {
        const auto & token_map = vocab.token_to_id;

        // normalize and split by whitespace
@@ -13457,7 +13730,7 @@ struct llm_tokenizer_wpm {
        // bos token prepended already

        // find the longest tokens that form the words
-        for (const std::string &word : words) {
+        for (const std::string & word : words) {
            // skip empty words
            if (word.size() == 0) {
                continue;
@@ -13474,7 +13747,7 @@ struct llm_tokenizer_wpm {
            for (int i = 0; i < n; ++i) {
                // loop through possible match length
                bool match = false;
-                for (int j = n; j > i; j--) {
+                for (int j = std::min(n, i + vocab.max_token_len + 1); j > i; j--) {
                    auto it = token_map.find(word1.substr(i, j - i));
                    if (it != token_map.end()) {
                        output.push_back(it->second);
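Bounding j by the longest token in the vocabulary avoids probing substrings that can never match, while the matching itself stays greedy longest-first. A stripped-down sketch of that inner loop (hypothetical helper, simplified bound, no UNK fallback):

    #include <algorithm>
    #include <string>
    #include <unordered_map>
    #include <vector>

    // Greedy longest-match over a word, bounded by the longest token length.
    static std::vector<int> wpm_match(const std::string & word,
                                      const std::unordered_map<std::string, int> & token_map,
                                      int max_token_len) {
        std::vector<int> out;
        const int n = (int) word.size();
        for (int i = 0; i < n; ) {
            int id = -1, next = i;
            for (int j = std::min(n, i + max_token_len); j > i; j--) {
                auto it = token_map.find(word.substr(i, j - i));
                if (it != token_map.end()) { id = it->second; next = j; break; }
            }
            if (id < 0) break; // real code falls back to an unknown-token id here
            out.push_back(id);
            i = next;
        }
        return out;
    }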
@@ -13497,7 +13770,8 @@ struct llm_tokenizer_wpm {
        }
    }

-    std::vector<std::string> preprocess(const std::string & text) {
+    // TODO: reduce string copies by using cpts_offs array
+    std::vector<std::string> preprocess(const std::string & text) const {
        const std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
        std::vector<std::string> words(1, "");
@@ -13792,6 +14066,8 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                    output.push_back(vocab.special_cls_id);
                }

+                llm_tokenizer_wpm tokenizer(vocab);
+
                for (const auto & fragment : fragment_buffer) {
                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                        auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
@@ -13799,7 +14075,6 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
#ifdef PRETOKENIZERDEBUG
                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
#endif
-                        llm_tokenizer_wpm tokenizer(vocab);
                        tokenizer.tokenize(raw_text, output);
                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                        output.push_back(fragment.token);
@@ -16713,6 +16988,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
        case LLM_ARCH_BERT:
        case LLM_ARCH_NOMIC_BERT:
        case LLM_ARCH_STABLELM:
+        case LLM_ARCH_BITNET:
        case LLM_ARCH_QWEN:
        case LLM_ARCH_QWEN2:
        case LLM_ARCH_QWEN2MOE:
@@ -18112,6 +18388,10 @@ void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)
    ctx->abort_callback_data = abort_callback_data;
}

+void llama_set_embeddings(struct llama_context * ctx, bool embeddings) {
+    ctx->cparams.embeddings = embeddings;
+}
+
void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
    ctx->cparams.causal_attn = causal_attn;
}
6	llama.h
@@ -174,6 +174,7 @@ extern "C" {
        LLAMA_POOLING_TYPE_NONE = 0,
        LLAMA_POOLING_TYPE_MEAN = 1,
        LLAMA_POOLING_TYPE_CLS  = 2,
+        LLAMA_POOLING_TYPE_LAST = 3,
    };

    enum llama_split_mode {
@@ -293,7 +294,6 @@ extern "C" {

        enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
        enum llama_pooling_type      pooling_type;      // whether to pool (sum) embedding results by sequence id
-                                                        // (ignored if no pooling layer)

        // ref: https://github.com/ggerganov/llama.cpp/pull/2054
        float rope_freq_base; // RoPE base frequency, 0 = from model
@@ -786,6 +786,10 @@ extern "C" {
    // Get the number of threads used for prompt and batch processing (multiple token).
    LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx);

+    // Set whether the model is in embeddings mode or not
+    // If true, embeddings will be returned but logits will not
+    LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings);
+
    // Set whether to use causal attention or not
    // If set to true, the model will only attend to the past tokens
    LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
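A minimal sketch of toggling embeddings mode at runtime and reading a pooled sequence embedding back. It assumes a context created with a pooling type other than NONE and uses the existing llama_decode / llama_get_embeddings_seq calls; error handling is abbreviated:

    #include "llama.h"
    #include <cstdio>

    // Sketch: switch an existing context into embeddings mode, decode a batch,
    // then read the pooled embedding for sequence 0.
    static void embed_batch(llama_context * ctx, llama_batch & batch) {
        llama_set_embeddings(ctx, true);   // return embeddings, skip logits

        if (llama_decode(ctx, batch) != 0) {
            fprintf(stderr, "decode failed\n");
            return;
        }

        const float * emb = llama_get_embeddings_seq(ctx, 0);
        if (emb) {
            printf("first component of pooled embedding: %f\n", emb[0]);
        }

        llama_set_embeddings(ctx, false);  // back to logits for generation
    }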
@@ -1,2 +1,2 @@
 -r ./requirements-convert-legacy-llama.txt
-torch~=2.1.1
+torch~=2.2.1

@@ -1,2 +1,2 @@
 -r ./requirements-convert-legacy-llama.txt
-torch~=2.1.1
+torch~=2.2.1
@@ -1,4 +1,4 @@
-numpy~=1.24.4
+numpy~=1.26.4
 sentencepiece~=0.2.0
 transformers>=4.40.1,<5.0.0
 gguf>=0.1.0
43	sgemm.cpp
@@ -249,9 +249,8 @@ class tinyBLAS {
        : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
    }

-    void matmul(int64_t m, int64_t n, int task) {
-        if (task == GGML_TASK_TYPE_COMPUTE)
-            mnpack(0, m, 0, n);
+    void matmul(int64_t m, int64_t n) {
+        mnpack(0, m, 0, n);
    }

  private:
@@ -458,9 +457,8 @@ class tinyBLAS_Q0_ARM {
        : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
    }

-    void matmul(int64_t m, int64_t n, int task) {
-        if (task == GGML_TASK_TYPE_COMPUTE)
-            mnpack(0, m, 0, n);
+    void matmul(int64_t m, int64_t n) {
+        mnpack(0, m, 0, n);
    }

  private:
@@ -596,9 +594,8 @@ class tinyBLAS_Q0_AVX {
        : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
    }

-    void matmul(int64_t m, int64_t n, int task) {
-        if (task == GGML_TASK_TYPE_COMPUTE)
-            mnpack(0, m, 0, n);
+    void matmul(int64_t m, int64_t n) {
+        mnpack(0, m, 0, n);
    }

  private:
@@ -829,7 +826,7 @@ class tinyBLAS_Q0_AVX {
 * For example, for single-threaded single-precision GEMM you can say
 *
 *     llamafile_sgemm(m, n, k, A, lda, B, ldb, C, ldc,
- *                     0, 1, GGML_TASK_TYPE_COMPUTE,
+ *                     0, 1,
 *                     GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32);
 *
 * @param m is rows in `A` and `C`
@@ -843,14 +840,13 @@ class tinyBLAS_Q0_AVX {
 * @param ldc is row stride of `C`
 * @param ith is thread id (must be less than `nth`)
 * @param nth is number of threads (must be greater than zero)
- * @param task is GGML task type
 * @param Atype is GGML data type of `A`
 * @param Btype is GGML data type of `B`
 * @param Ctype is GGML data type of `C`
 * @return true if this function was able to service the matmul request
 */
bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda, const void *B, int64_t ldb, void *C,
-                     int64_t ldc, int ith, int nth, int task, int Atype, int Btype, int Ctype) {
+                     int64_t ldc, int ith, int nth, int Atype, int Btype, int Ctype) {

    assert(m >= 0);
    assert(n >= 0);
@@ -877,7 +873,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
            (const float *)B, ldb,
            (float *)C, ldc,
            ith, nth};
-        tb.matmul(m, n, task);
+        tb.matmul(m, n);
        return true;
#elif defined(__AVX__) || defined(__AVX2__)
        if (k % 8)
@@ -887,7 +883,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
            (const float *)B, ldb,
            (float *)C, ldc,
            ith, nth};
-        tb.matmul(m, n, task);
+        tb.matmul(m, n);
        return true;
#elif defined(__ARM_NEON)
        if (n < 4)
@@ -899,7 +895,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
            (const float *)B, ldb,
            (float *)C, ldc,
            ith, nth};
-        tb.matmul(m, n, task);
+        tb.matmul(m, n);
        return true;
#else
        return false;
@@ -917,7 +913,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
            (const float *)B, ldb,
            (float *)C, ldc,
            ith, nth};
-        tb.matmul(m, n, task);
+        tb.matmul(m, n);
        return true;
#elif (defined(__AVX__) || defined(__AVX2__)) && defined(__F16C__)
        if (k % 8)
@@ -929,7 +925,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
            (const float *)B, ldb,
            (float *)C, ldc,
            ith, nth};
-        tb.matmul(m, n, task);
+        tb.matmul(m, n);
        return true;
#elif defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER)
        if (n < 8)
@@ -943,7 +939,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
            (const ggml_fp16_t *)B, ldb,
            (float *)C, ldc,
            ith, nth};
-        tb.matmul(m, n, task);
+        tb.matmul(m, n);
        return true;
#elif defined(__ARM_NEON) && !defined(_MSC_VER)
        if (k % 4)
@@ -955,7 +951,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
            (const float *)B, ldb,
            (float *)C, ldc,
            ith, nth};
-        tb.matmul(m, n, task);
+        tb.matmul(m, n);
        return true;
#else
        return false;
@@ -971,7 +967,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
            (const block_q8_0 *)B, ldb,
            (float *)C, ldc,
            ith, nth};
-        tb.matmul(m, n, task);
+        tb.matmul(m, n);
        return true;
#elif defined(__ARM_FEATURE_DOTPROD)
        tinyBLAS_Q0_ARM<block_q8_0> tb{
@@ -979,7 +975,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
            (const block_q8_0 *)B, ldb,
            (float *)C, ldc,
            ith, nth};
-        tb.matmul(m, n, task);
+        tb.matmul(m, n);
        return true;
#else
        return false;
@@ -995,7 +991,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
            (const block_q8_0 *)B, ldb,
            (float *)C, ldc,
            ith, nth};
-        tb.matmul(m, n, task);
+        tb.matmul(m, n);
        return true;
#elif defined(__ARM_FEATURE_DOTPROD)
        tinyBLAS_Q0_ARM<block_q4_0> tb{
@@ -1003,7 +999,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
            (const block_q8_0 *)B, ldb,
            (float *)C, ldc,
            ith, nth};
-        tb.matmul(m, n, task);
+        tb.matmul(m, n);
        return true;
#else
        return false;
@@ -1025,7 +1021,6 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
    (void)ldc;
    (void)ith;
    (void)nth;
-    (void)task;
    (void)Atype;
    (void)Btype;
    (void)Ctype;
2	sgemm.h
@@ -7,7 +7,7 @@ extern "C" {

bool llamafile_sgemm(int64_t, int64_t, int64_t, const void *, int64_t,
                     const void *, int64_t, void *, int64_t, int, int,
-                     int, int, int, int);
+                     int, int, int);

#ifdef __cplusplus
}
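With the task parameter gone, callers invoke llamafile_sgemm only from the compute pass and pass just the thread id/count plus the three GGML types, as the updated doc comment in sgemm.cpp shows. A hedged sketch of the new call shape; the buffers, strides and thread indices are assumed to be prepared elsewhere:

    // Declaration matching the trimmed sgemm.h signature above.
    extern "C" bool llamafile_sgemm(int64_t, int64_t, int64_t, const void *, int64_t,
                                    const void *, int64_t, void *, int64_t, int, int,
                                    int, int, int);

    static bool run_sgemm_f32(int64_t m, int64_t n, int64_t k,
                              const float * A, int64_t lda,
                              const float * B, int64_t ldb,
                              float * C, int64_t ldc,
                              int ith, int nth) {
        // GGML_TYPE_F32 is 0 in ggml.h; spelled numerically here to keep the sketch self-contained.
        const int f32 = 0;
        return llamafile_sgemm(m, n, k, A, lda, B, ldb, C, ldc,
                               ith, nth, f32, f32, f32);
    }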
@@ -785,6 +785,10 @@ struct test_cpy : public test_case {
        return VARS_TO_STR3(type_src, type_dst, ne);
    }

+    double max_nmse_err() override {
+        return 1e-6;
+    }
+
    size_t op_size(ggml_tensor * t) override {
        return ggml_nbytes(t) + ggml_nbytes(t->src[0]);
    }
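max_nmse_err loosens the comparison threshold for this op: test-backend-ops compares backend output against the reference by normalized mean squared error, i.e. the squared error divided by the reference's energy. A hedged restatement (the exact normalization in the test harness may differ slightly):

    #include <cstddef>

    // Illustrative NMSE: ||a - b||^2 / ||a||^2, with a as the reference output.
    static double nmse(const float * a, const float * b, size_t n) {
        double err = 0.0, ref = 0.0;
        for (size_t i = 0; i < n; ++i) {
            const double d = a[i] - b[i];
            err += d * d;
            ref += (double) a[i] * a[i];
        }
        return ref > 0.0 ? err / ref : err;
    }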
@@ -7,11 +7,16 @@
#include "ggml.h"
#include "llama.h"
#include "grammar-parser.h"
+#include "json-schema-to-grammar.h"
#include "unicode.h"
#include <cassert>
#include <string>
#include <vector>

+using json = nlohmann::ordered_json;
+
+//#define INCLUDE_FAILING_TESTS 1
+
static llama_grammar* build_grammar(const std::string & grammar_str) {
    auto parsed_grammar = grammar_parser::parse(grammar_str.c_str());
@@ -65,8 +70,8 @@ static bool match_string(const std::string & input, llama_grammar* grammar) {
    return false;
}

-static void test_grammar(const std::string & test_desc, const std::string & grammar_str, const std::vector<std::string> & passing_strings, const std::vector<std::string> & failing_strings) {
-    fprintf(stderr, "⚫ Testing %s. Grammar: %s\n", test_desc.c_str(), grammar_str.c_str());
+static void test(const std::string & test_desc, const std::string & grammar_str, const std::vector<std::string> & passing_strings, const std::vector<std::string> & failing_strings) {
+    fprintf(stderr, "⚫ Testing %s\n%s\n", test_desc.c_str(), grammar_str.c_str());
    fflush(stderr);

    auto grammar = build_grammar(grammar_str);
@@ -85,6 +90,23 @@ static void test_grammar(const std::string & test_desc, const std::string & gram

        if (!matched) {
            fprintf(stderr, "❌ (failed to match)\n");
+
+            // DEBUG: Write strings to files so that we can analyze more easily with gbnf-validator program to see exactly where things failed.
+            // DEBUG: Write the grammar_str to test-grammar-integration.grammar.gbnf
+            FILE* grammar_file = fopen("test-grammar-integration.grammar.gbnf", "w");
+            if (grammar_file) {
+                fprintf(grammar_file, "%s", grammar_str.c_str());
+                fclose(grammar_file);
+            }
+
+            // DEBUG: Write the test string to test-grammar-integration.string.txt
+            FILE* string_file = fopen("test-grammar-integration.string.txt", "w");
+            if (string_file) {
+                fprintf(string_file, "%s", test_string.c_str());
+                fclose(string_file);
+            }
+
+            fprintf(stderr, "\n NOTE: Debug grammar file generated. To analyze this failure in detail, run the following command: ./llama-gbnf-validator test-grammar-integration.grammar.gbnf test-grammar-integration.string.txt\n\n");
        } else {
            fprintf(stdout, "✅︎\n");
        }
@@ -118,6 +140,12 @@ static void test_grammar(const std::string & test_desc, const std::string & gram
    // Clean up allocated memory
    llama_grammar_free(grammar);
}
+static void test_grammar(const std::string & test_desc, const std::string & grammar_str, const std::vector<std::string> & passing_strings, const std::vector<std::string> & failing_strings) {
+    test(test_desc + ". Grammar: " + grammar_str, grammar_str, passing_strings, failing_strings);
+}
+static void test_schema(const std::string & test_desc, const std::string & schema_str, const std::vector<std::string> & passing_strings, const std::vector<std::string> & failing_strings) {
+    test(test_desc + ". Schema: " + schema_str, json_schema_to_grammar(json::parse(schema_str)), passing_strings, failing_strings);
+}

static void test_simple_grammar() {
    // Test case for a simple grammar
@@ -400,10 +428,11 @@ static void test_quantifiers() {
static void test_failure_missing_root() {
    fprintf(stderr, "⚫ Testing missing root node:\n");
    // Test case for a grammar that is missing a root rule
-    const std::string grammar_str = R"""(rot ::= expr
-expr ::= term ("+" term)*
-term ::= number
-number ::= [0-9]+)""";
+    const std::string grammar_str = R"""(
+        rot ::= expr
+        expr ::= term ("+" term)*
+        term ::= number
+        number ::= [0-9]+)""";

    grammar_parser::parse_state parsed_grammar = grammar_parser::parse(grammar_str.c_str());
@@ -420,10 +449,10 @@ static void test_failure_missing_reference() {

    // Test case for a grammar that is missing a referenced rule
    const std::string grammar_str =
        R"""(root ::= expr
        expr ::= term ("+" term)*
        term ::= numero
        number ::= [0-9]+)""";

    fprintf(stderr, "    Expected error:  ");
@@ -445,29 +474,558 @@ static void test_failure_left_recursion() {

    // Test more complicated left recursion detection
    const std::string medium_str = R"""(
root ::= asdf
asdf ::= "a" | asdf "a"
)""";
    assert(test_build_grammar_fails(medium_str));

    // Test even more complicated left recursion detection
    const std::string hard_str = R"""(
root ::= asdf
asdf ::= "a" | foo "b"
foo ::= "c" | asdf "d" | "e")""";
    assert(test_build_grammar_fails(hard_str));

    // Test yet even more complicated left recursion detection
    const std::string hardest_str = R"""(
root ::= asdf
asdf ::= "a" | foo "b"
foo ::= "c" | empty asdf "d" | "e"
empty ::= "blah" | )""";
    assert(test_build_grammar_fails(hardest_str));

    fprintf(stderr, "  ✅︎ Passed\n");
}

+static void test_json_schema() {
+    // Note that this is similar to the regular grammar tests,
+    // but we convert each json schema to a grammar before parsing.
+    // Otherwise, this test structure is the same.
+
+    test_schema(
+        "empty schema (object)",
+        // Schema
+        R"""(
+            {}
+        )""",
+        // Passing strings
+        {
+            "{}",
+            R"""({"foo": "bar"})""",
+        },
+        // Failing strings
+        {
+            "",
+            "[]",
+            "null",
+            "\"\"",
+            "true",
+        }
+    );
+
+    test_schema(
+        "exotic formats (list)",
+        // Schema
+        R"""(
+            {
+                "items": [
+                    { "format": "date" },
+                    { "format": "uuid" },
+                    { "format": "time" },
+                    { "format": "date-time" }
+                ]
+            }
+        )""",
+        // Passing strings
+        {
+            // "{}", // NOTE: This string passes for this schema on https://www.jsonschemavalidator.net/ -- should it?
+            // "[]", // NOTE: This string passes for this schema on https://www.jsonschemavalidator.net/ -- should it?
+            R"""(["2012-04-23", "12345678-1234-1234-1234-1234567890ab", "18:25:43.511Z", "2012-04-23T18:25:43.511Z"])""",
+            //R"""(["2012-04-23","12345678-1234-1234-1234-1234567890ab"])""", // NOTE: This string passes for this schema on https://www.jsonschemavalidator.net/ -- should it?
+            //R"""({"foo": "bar"})""", // NOTE: This string passes for this schema on https://www.jsonschemavalidator.net/ -- should it?
+        },
+        // Failing strings
+        {
+            R"""(["foo", "bar"])""",
+            R"""(["12345678-1234-1234-1234-1234567890ab"])""",
+        }
+    );
+
+    test_schema(
+        "string",
+        // Schema
+        R"""(
+            {
+                "type": "string"
+            }
+        )""",
+        // Passing strings
+        {
+            "\"foo\"",
+            "\"bar\"",
+            "\"\"",
+        },
+        // Failing strings
+        {
+            "{}",
+            "\"foo\": \"bar\"",
+        }
+    );
+
+    test_schema(
+        "string w/ min length 1",
+        // Schema
+        R"""(
+            {
+                "type": "string",
+                "minLength": 1
+            }
+        )""",
+        // Passing strings
+        {
+            "\"foo\"",
+            "\"bar\"",
+        },
+        // Failing strings
+        {
+            "\"\"",
+            "{}",
+            "\"foo\": \"bar\"",
+        }
+    );
+
+    test_schema(
+        "string w/ min length 3",
+        // Schema
+        R"""(
+            {
+                "type": "string",
+                "minLength": 3
+            }
+        )""",
+        // Passing strings
+        {
+            "\"foo\"",
+            "\"bar\"",
+            "\"foobar\"",
+        },
+        // Failing strings
+        {
+            "\"\"",
+            "\"f\"",
+            "\"fo\"",
+        }
+    );
+
+    test_schema(
+        "string w/ max length",
+        // Schema
+        R"""(
+            {
+                "type": "string",
+                "maxLength": 3
+            }
+        )""",
+        // Passing strings
+        {
+            "\"foo\"",
+            "\"bar\"",
+            "\"\"",
+            "\"f\"",
+            "\"fo\"",
+        },
+        // Failing strings
+        {
+            "\"foobar\"",
+        }
+    );
+
+    test_schema(
+        "string w/ min & max length",
+        // Schema
+        R"""(
+            {
+                "type": "string",
+                "minLength": 1,
+                "maxLength": 4
+            }
+        )""",
+        // Passing strings
+        {
+            "\"foo\"",
+            "\"bar\"",
+            "\"f\"",
+            "\"barf\"",
+        },
+        // Failing strings
+        {
+            "\"\"",
+            "\"barfo\"",
+            "\"foobar\"",
+        }
+    );
+
+    test_schema(
+        "boolean",
+        // Schema
+        R"""(
+            {
+                "type": "boolean"
+            }
+        )""",
+        // Passing strings
+        {
+            "true",
+            "false",
+        },
+        // Failing strings
+        {
+            "\"\"",
+            "\"true\"",
+            "True",
+            "FALSE",
+        }
+    );
+
+    test_schema(
+        "integer",
+        // Schema
+        R"""(
+            {
+                "type": "integer"
+            }
+        )""",
+        // Passing strings
+        {
+            "0",
+            "12345",
+            "1234567890123456"
+        },
+        // Failing strings
+        {
+            "",
+            "01",
+            "007",
+            "12345678901234567"
+        }
+    );
+    test_schema(
+        "string const",
+        // Schema
+        R"""(
+            {
+                "const": "foo"
+            }
+        )""",
+        // Passing strings
+        {
+            "\"foo\"",
+        },
+        // Failing strings
+        {
+            "foo",
+            "\"bar\"",
+        }
+    );
+
+    test_schema(
+        "non-string const",
+        // Schema
+        R"""(
+            {
+                "const": true
+            }
+        )""",
+        // Passing strings
+        {
+            "true",
+        },
+        // Failing strings
+        {
+            "",
+            "foo",
+            "\"true\"",
+        }
+    );
+
+    test_schema(
+        "non-string const",
+        // Schema
+        R"""(
+            {
+                "enum": ["red", "amber", "green", null, 42, ["foo"]]
+            }
+        )""",
+        // Passing strings
+        {
+            "\"red\"",
+            "null",
+            "42",
+            "[\"foo\"]",
+        },
+        // Failing strings
+        {
+            "",
+            "420",
+            "true",
+            "foo",
+        }
+    );
+
+
+    test_schema(
+        "min+max items",
+        // Schema
+        R"""(
+            {
+                "items": {
+                    "type": ["number", "integer"]
+                },
+                "minItems": 3,
+                "maxItems": 5
+            }
+        )""",
+        // Passing strings
+        {
+            "[1, 2, 3]",
+            "[1, 2, 3, 4]",
+            "[1, 2, 3, 4, 5]",
+        },
+        // Failing strings
+        {
+            "[1, 2]",
+            "[1, 2, 3, 4, 5, 6]",
+            "1"
+        }
+    );
+    // Properties (from: https://json-schema.org/understanding-json-schema/reference/object#properties)
+    test_schema(
+        "object properties",
+        // Schema
+        R"""(
+            {
+                "type": "object",
+                "properties": {
+                    "number": { "type": "number" },
+                    "street_name": { "type": "string" },
+                    "street_type": { "enum": ["Street", "Avenue", "Boulevard"] }
+                }
+            }
+        )""",
+        // Passing strings
+        {
+            R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue"})""",
+            // "By default, leaving out properties is valid"
+            R"""({ "street_name": "Pennsylvania" })""",
+            R"""({ "number": 1600, "street_name": "Pennsylvania" })""",
+            // "By extension, even an empty object is valid"
+            R"""({})""",
+            // "By default, providing additional properties is valid"
+#ifdef INCLUDE_FAILING_TESTS
+            // TODO: The following should pass, but currently FAILS. Additional properties should be permitted by default.
+            R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue", "direction":"NW"})""",
+            // TODO: Spaces should be permitted around enum values, but currently they fail to pass.
+            R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type": "Avenue" })""",
+#endif
+        },
+        // Failing strings
+        {
+            // Change datatype from number to string
+            R"""({ "number": "1600", "street_name": "Pennsylvania", "street_type":"Avenue"})""",
+            // Reorder properties
+            R"""({ "street_name": "Pennsylvania", "number": 1600 })""",
+            // Reorder properties
+            R"""({ "number": "1600", "street_name": "Pennsylvania", "street_type":"Avenue"})""",
+        }
+    );
+
+
+    // Properties (from: https://json-schema.org/understanding-json-schema/reference/object#properties)
+    test_schema(
+        "object properties, additionalProperties: true",
+        // Schema
+        R"""(
+            {
+                "type": "object",
+                "properties": {
+                    "number": { "type": "number" },
+                    "street_name": { "type": "string" },
+                    "street_type": { "enum": ["Street", "Avenue", "Boulevard"] }
+                },
+                "additionalProperties": true
+            }
+        )""",
+        // Passing strings
+        {
+            // "By extension, even an empty object is valid"
+            R"""({})""",
+#ifdef INCLUDE_FAILING_TESTS
+            // TODO: Following line should pass and doesn't
+            R"""({"number":1600,"street_name":"Pennsylvania","street_type":"Avenue"})""",
+            // "By default, leaving out properties is valid"
+            // TODO: Following line should pass and doesn't
+            R"""({ "street_name": "Pennsylvania" })""",
+            // TODO: Following line should pass and doesn't
+            R"""({ "number": 1600, "street_name": "Pennsylvania" })""",
+            // "By default, providing additional properties is valid"
+            // TODO: The following should pass, but currently FAILS. Additional properties should be permitted by default.
+            R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue", "direction":"NW"})""",
+            // TODO: Spaces should be permitted around enum values, but currently they fail to pass.
+            R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type": "Avenue" })""",
+#endif
+        },
+        // Failing strings
+        {
+            // Change datatype from number to string
+            R"""({ "number": "1600", "street_name": "Pennsylvania", "street_type":"Avenue"})""",
+            // Reorder properties
+            R"""({ "street_name": "Pennsylvania", "number": 1600, "street_type":"Avenue"})""",
+        }
+    );
|
||||||
|
test_schema(
|
||||||
|
"required + optional props each in original order",
|
||||||
|
// Schema
|
||||||
|
R"""(
|
||||||
|
{
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"number": { "type": "number" },
|
||||||
|
"street_name": { "type": "string" },
|
||||||
|
"street_type": { "enum": ["Street", "Avenue", "Boulevard"] }
|
||||||
|
},
|
||||||
|
"additionalProperties": false
|
||||||
|
}
|
||||||
|
)""",
|
||||||
|
// Passing strings
|
||||||
|
{
|
||||||
|
R"""({ "street_name": "Pennsylvania" })""",
|
||||||
|
R"""({ "number": 1600, "street_type":"Avenue"})""",
|
||||||
|
R"""({ "number": 1600, "street_name": "Pennsylvania" })""",
|
||||||
|
R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue"})""",
|
||||||
|
#ifdef INCLUDE_FAILING_TESTS
|
||||||
|
// TODO: Spaces should be permitted around enum values, but currently they fail to pass.
|
||||||
|
R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type": "Avenue" })""",
|
||||||
|
#endif
|
||||||
|
},
|
||||||
|
// Failing strings
|
||||||
|
{
|
||||||
|
// Reorder properties
|
||||||
|
R"""({ "street_type": "Avenue", "number": 1600 })""",
|
||||||
|
// Add "direction"
|
||||||
|
R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type": "Avenue", "direction": "NW" })""",
|
||||||
|
}
|
||||||
|
);
|
||||||
|
|
||||||
|
test_schema(
|
||||||
|
"required + optional props each in original order",
|
||||||
|
// Schema
|
||||||
|
R"""(
|
||||||
|
{
|
||||||
|
"properties": {
|
||||||
|
"b": {"type": "string"},
|
||||||
|
"a": {"type": "string"},
|
||||||
|
"d": {"type": "string"},
|
||||||
|
"c": {"type": "string"}
|
||||||
|
},
|
||||||
|
"required": ["a", "b"],
|
||||||
|
"additionalProperties": false
|
||||||
|
}
|
||||||
|
)""",
|
||||||
|
// Passing strings
|
||||||
|
{
|
||||||
|
R"""({"b": "foo", "a": "bar"})""",
|
||||||
|
R"""({"b":"foo","a":"bar","d":"qux"})""",
|
||||||
|
R"""({"b":"foo", "a":"bar", "d":"qux", "c":"baz"})""",
|
||||||
|
},
|
||||||
|
// Failing strings
|
||||||
|
{
|
||||||
|
R"""({"a": "foo", "b": "bar"})""",
|
||||||
|
R"""({"b": "bar"})""",
|
||||||
|
R"""({"a": "foo", "c": "baz"})""",
|
||||||
|
R"""({"a":"foo", "b":"bar", "c":"baz", "d":"qux"})""",
|
||||||
|
}
|
||||||
|
);
|
||||||
|
|
||||||
|
// NOTE: Example from https://json-schema.org/learn/getting-started-step-by-step#define-required-properties
|
||||||
|
test_schema(
|
||||||
|
"required props",
|
||||||
|
// Schema
|
||||||
|
R"""(
|
||||||
|
{
|
||||||
|
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
||||||
|
"$id": "https://example.com/product.schema.json",
|
||||||
|
"title": "Product",
|
||||||
|
"description": "A product from Acme's catalog",
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"productId": {
|
||||||
|
"description": "The unique identifier for a product",
|
||||||
|
"type": "integer"
|
||||||
|
},
|
||||||
|
"productName": {
|
||||||
|
"description": "Name of the product",
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"price": {
|
||||||
|
"description": "The price of the product",
|
||||||
|
"type": "number",
|
||||||
|
"exclusiveMinimum": 0
|
||||||
|
},
|
||||||
|
"tags": {
|
||||||
|
"description": "Tags for the product",
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"minItems": 1,
|
||||||
|
"uniqueItems": true
|
||||||
|
},
|
||||||
|
"dimensions": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"length": {
|
||||||
|
"type": "number"
|
||||||
|
},
|
||||||
|
"width": {
|
||||||
|
"type": "number"
|
||||||
|
},
|
||||||
|
"height": {
|
||||||
|
"type": "number"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": [ "length", "width", "height" ]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": [ "productId", "productName", "price" ]
|
||||||
|
}
|
||||||
|
)""",
|
||||||
|
// Passing strings
|
||||||
|
{
|
||||||
|
R"""({"productId": 1, "productName": "A green door", "price": 12.50})""",
|
||||||
|
R"""({"productId": 1, "productName": "A green door", "price": 12.50, "tags": ["home", "green"]})""",
|
||||||
|
R"""({"productId": 1, "productName": "A green door", "price": 12.50, "tags": ["home", "green"], "dimensions": {"length": 785, "width": 250.5, "height": -0.359}})""",
|
||||||
|
},
|
||||||
|
// Failing strings
|
||||||
|
{
|
||||||
|
R"""({})""", // Missing all required properties
|
||||||
|
R"""({"productName": "A green door", "price": 12.50, "productId": 1})""", // Out of order properties
|
||||||
|
// TODO: The following line should fail, but currently it passes. `exclusiveMinimum` is not supported, as it would likely be too difficult to implement.
|
||||||
|
// Perhaps special checks for minimum and maximum values of 0 could be added (since that's relatively easy to do with grammars), but anything else would likely be too complex.
|
||||||
|
// R"""({"productId": 1, "productName": "A green door", "price": -12.50})""",
|
||||||
|
R"""({"productId": 1, "productName": "A green door"})""", // Missing required property (price)
|
||||||
|
R"""({"productName": "A green door", "price": 12.50})""", // Missing required property (productId)
|
||||||
|
R"""({"productId": 1, "productName": "A green door", "price": 12.50, "tags": []})""", // tags is empty, but minItems is 1
|
||||||
|
R"""({"productId": 1, "productName": "A green door", "price": 12.50, "dimensions": {"length": 785, "width": 250.5, "height": -0.359}, "tags": ["home", "green"]})""", // Tags and dimensions are out of order
|
||||||
|
// TODO: The following line should fail, but currently it passes. `uniqueItems` is not supported, as it would likely be too difficult to implement.
|
||||||
|
// R"""({"productId": 1, "productName": "A green door", "price": 12.50, "tags": ["home", "green", "home"]})""",
|
||||||
|
}
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
int main() {
|
int main() {
|
||||||
fprintf(stdout, "Running grammar integration tests...\n");
|
fprintf(stdout, "Running grammar integration tests...\n");
|
||||||
test_simple_grammar();
|
test_simple_grammar();
|
||||||
|
@ -477,6 +1035,7 @@ int main() {
|
||||||
test_failure_missing_root();
|
test_failure_missing_root();
|
||||||
test_failure_missing_reference();
|
test_failure_missing_reference();
|
||||||
test_failure_left_recursion();
|
test_failure_left_recursion();
|
||||||
|
test_json_schema();
|
||||||
fprintf(stdout, "All tests passed.\n");
|
fprintf(stdout, "All tests passed.\n");
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
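A reader's note on the structure of these tests (not part of the diff): each test_schema(...) call presumably pairs a JSON schema with strings the schema-derived grammar must accept and strings it must reject, which is why re-ordered properties are listed under the failing strings: the generated grammar fixes a property order even though JSON itself does not. Below is a minimal, self-contained sketch of that pass/fail harness shape; run_schema_case and the trivial order-checking predicate are illustrative stand-ins, not the real schema-to-grammar matcher.

#include <cstdio>
#include <functional>
#include <string>
#include <vector>

// Hypothetical harness mirroring the shape of test_schema(name, schema, passing, failing):
// `matches` stands in for "parse the string with the grammar generated from the schema".
static bool run_schema_case(const std::string & name,
                            const std::function<bool(const std::string &)> & matches,
                            const std::vector<std::string> & passing,
                            const std::vector<std::string> & failing) {
    bool ok = true;
    for (const auto & s : passing) {
        if (!matches(s)) { std::fprintf(stderr, "[%s] should pass: %s\n", name.c_str(), s.c_str()); ok = false; }
    }
    for (const auto & s : failing) {
        if (matches(s)) { std::fprintf(stderr, "[%s] should fail: %s\n", name.c_str(), s.c_str()); ok = false; }
    }
    return ok;
}

int main() {
    // Trivial stand-in predicate: accept only objects that start with property "a" and also contain "b".
    auto matches = [](const std::string & s) {
        return s.rfind("{\"a\"", 0) == 0 && s.find("\"b\"") != std::string::npos;
    };
    const bool ok = run_schema_case("order matters",
        matches,
        { R"""({"a": 1, "b": 2})""" },   // expected to pass
        { R"""({"b": 2, "a": 1})""" });  // expected to fail: same data, different property order
    std::printf("%s\n", ok ? "ok" : "failed");
    return ok ? 0 : 1;
}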
@@ -596,6 +596,7 @@ std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & c
 
 std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
     std::vector<uint32_t> result;
+    result.reserve(utf8.size());
     size_t offset = 0;
     while (offset < utf8.size()) {
         result.push_back(unicode_cpt_from_utf8(utf8, offset));
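A reader's note on the single added line above: every UTF-8 code point occupies at least one byte, so utf8.size() is an upper bound on the number of decoded code points, and reserving it up front avoids repeated reallocation during the decode loop. Below is a minimal sketch of the same pattern under that assumption; decode_next_cpt is a simplified stand-in for unicode_cpt_from_utf8 and assumes well-formed input.

#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

// Simplified stand-in decoder: reads one code point starting at `offset`
// and advances `offset` past its bytes (no validation of malformed input).
static uint32_t decode_next_cpt(const std::string & utf8, size_t & offset) {
    const uint8_t b = (uint8_t) utf8[offset];
    if (b < 0x80) { offset += 1; return b; }                       // 1-byte ASCII
    if (b < 0xE0) {                                                // 2-byte sequence
        const uint32_t cpt = ((b & 0x1F) << 6) | ((uint8_t) utf8[offset + 1] & 0x3F);
        offset += 2; return cpt;
    }
    if (b < 0xF0) {                                                // 3-byte sequence
        const uint32_t cpt = ((b & 0x0F) << 12) | (((uint8_t) utf8[offset + 1] & 0x3F) << 6)
                           | ((uint8_t) utf8[offset + 2] & 0x3F);
        offset += 3; return cpt;
    }
    const uint32_t cpt = ((b & 0x07) << 18) | (((uint8_t) utf8[offset + 1] & 0x3F) << 12)
                       | (((uint8_t) utf8[offset + 2] & 0x3F) << 6) | ((uint8_t) utf8[offset + 3] & 0x3F);
    offset += 4; return cpt;                                       // 4-byte sequence
}

std::vector<uint32_t> cpts_from_utf8(const std::string & utf8) {
    std::vector<uint32_t> result;
    result.reserve(utf8.size()); // byte count >= code point count, so at most one allocation
    size_t offset = 0;
    while (offset < utf8.size()) {
        result.push_back(decode_next_cpt(utf8, offset));
    }
    return result;
}

int main() {
    const std::string s = "héllo"; // 'é' is 2 bytes in UTF-8
    const auto cpts = cpts_from_utf8(s);
    std::printf("%zu bytes -> %zu code points\n", s.size(), cpts.size());
}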
@@ -13,7 +13,7 @@ layout (constant_id = 0) const uint BLOCK_SIZE = 32;
 shared FLOAT_TYPE tmp[BLOCK_SIZE];
 
 void main() {
-    const uint row = gl_WorkGroupID.x;
+    const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
     const uint tid = gl_LocalInvocationID.x;
 
     uint a_offset, b_offset, d_offset;

@@ -7,7 +7,7 @@ layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
 shared FLOAT_TYPE tmp[32];
 
 void main() {
-    const uint row = gl_WorkGroupID.x;
+    const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
 
     uint a_offset, b_offset, d_offset;
     get_offsets(a_offset, b_offset, d_offset);

@@ -7,7 +7,7 @@ layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
 shared FLOAT_TYPE tmp[32];
 
 void main() {
-    const uint row = gl_WorkGroupID.x;
+    const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
 
     uint a_offset, b_offset, d_offset;
     get_offsets(a_offset, b_offset, d_offset);

@@ -7,7 +7,7 @@ layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
 shared FLOAT_TYPE tmp[32];
 
 void main() {
-    const uint row = gl_WorkGroupID.x;
+    const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
 
     uint a_offset, b_offset, d_offset;
     get_offsets(a_offset, b_offset, d_offset);

@@ -7,7 +7,7 @@ layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
 shared FLOAT_TYPE tmp[32];
 
 void main() {
-    const uint row = gl_WorkGroupID.x;
+    const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
 
     uint a_offset, b_offset, d_offset;
     get_offsets(a_offset, b_offset, d_offset);

@@ -7,7 +7,7 @@ layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
 shared FLOAT_TYPE tmp[32];
 
 void main() {
-    const uint row = gl_WorkGroupID.x;
+    const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
 
     uint a_offset, b_offset, d_offset;
     get_offsets(a_offset, b_offset, d_offset);
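A reader's note on the repeated one-line shader change above: computing the row as gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z lets a dispatch spill extra rows into the z dimension of the workgroup grid, which is useful when the row count would otherwise exceed a per-dimension workgroup limit (commonly 65535). Below is a minimal host-side sketch of how such a split could be computed; split_rows and max_groups_x are illustrative names, not the actual backend API.

#include <cstdint>
#include <cstdio>

// Illustrative only: split `num_rows` workgroups across x and z so neither
// dimension exceeds the device limit, mirroring the shader's
// row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z.
struct Dispatch { uint32_t groups_x, groups_y, groups_z; };

static Dispatch split_rows(uint32_t num_rows, uint32_t max_groups_x = 65535) {
    const uint32_t groups_z = (num_rows + max_groups_x - 1) / max_groups_x; // number of z-slices
    const uint32_t groups_x = (num_rows + groups_z - 1) / groups_z;         // rows per z-slice
    return { groups_x, 1, groups_z };
}

int main() {
    const uint32_t num_rows = 100000; // more rows than fit in one dispatch dimension
    const Dispatch d = split_rows(num_rows);

    // Count how many distinct row indices the grid produces below num_rows
    // (a real shader would either guard against row >= num_rows or rely on an exact split).
    uint64_t covered = 0;
    for (uint32_t z = 0; z < d.groups_z; ++z) {
        for (uint32_t x = 0; x < d.groups_x; ++x) {
            const uint32_t row = x + d.groups_x * z; // same formula as the shader
            if (row < num_rows) { covered++; }
        }
    }
    std::printf("groups_x=%u groups_z=%u covered=%llu rows\n",
                (unsigned) d.groups_x, (unsigned) d.groups_z, (unsigned long long) covered);
}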