fix conflicts

This commit is contained in: commit 95708067c9

55 changed files with 25657 additions and 23589 deletions
@@ -28,4 +28,5 @@ indent_size = 2
 indent_style = tab

 [examples/cvector-generator/*.txt]
+trim_trailing_whitespace = unset
 insert_final_newline = unset
.github/workflows/docker.yml (vendored, 6 changes)
@@ -33,15 +33,13 @@ jobs:
       - { tag: "light", dockerfile: ".devops/llama-cli.Dockerfile", platforms: "linux/amd64,linux/arm64" }
       - { tag: "server", dockerfile: ".devops/llama-server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
       - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-      # NOTE(canardletter): The CUDA builds on arm64 are very slow, so I
-      # have disabled them for now until the reason why
-      # is understood.
      - { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" }
      - { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" }
      - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
      - { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
      - { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-      - { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+      # Note: the full-rocm image is failing due to a "no space left on device" error. It is disabled for now to allow the workflow to complete.
+      #- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
      - { tag: "light-intel", dockerfile: ".devops/llama-cli-intel.Dockerfile", platforms: "linux/amd64" }
      - { tag: "server-intel", dockerfile: ".devops/llama-server-intel.Dockerfile", platforms: "linux/amd64" }
     steps:
.github/workflows/server.yml (vendored, 2 changes)
@@ -30,7 +30,7 @@ jobs:

     strategy:
       matrix:
-        sanitizer: [ADDRESS, THREAD, UNDEFINED]
+        sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken
         build_type: [RelWithDebInfo]
         include:
           - build_type: Release
@ -144,9 +144,6 @@ option(LLAMA_BUILD_SERVER "llama: build server example"
|
||||||
option(LLAMA_LASX "llama: enable lasx" ON)
|
option(LLAMA_LASX "llama: enable lasx" ON)
|
||||||
option(LLAMA_LSX "llama: enable lsx" ON)
|
option(LLAMA_LSX "llama: enable lsx" ON)
|
||||||
|
|
||||||
# add perf arguments
|
|
||||||
option(LLAMA_PERF "llama: enable perf" OFF)
|
|
||||||
|
|
||||||
# Required for relocatable CMake package
|
# Required for relocatable CMake package
|
||||||
include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
|
include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
|
||||||
|
|
||||||
|
@@ -665,6 +662,7 @@ if (LLAMA_SYCL)
     #todo: AOT

     find_package(IntelSYCL REQUIRED)
+    find_package(MKL REQUIRED)

     message(STATUS "SYCL found")
@@ -679,11 +677,9 @@ if (LLAMA_SYCL)
     endif()

     add_compile_options(-I./) #include DPCT
-    add_compile_options(-I/${SYCL_INCLUDE_DIR})

     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
     if (LLAMA_SYCL_TARGET STREQUAL "NVIDIA")
         set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
     endif()
@@ -693,8 +689,10 @@ if (LLAMA_SYCL)
     list(APPEND GGML_SOURCES_SYCL "ggml-sycl.cpp")

     if (WIN32)
-        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl sycl7 OpenCL mkl_sycl_blas_dll.lib mkl_intel_ilp64_dll.lib mkl_sequential_dll.lib mkl_core_dll.lib)
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
     else()
+        add_compile_options(-I/${SYCL_INCLUDE_DIR})
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
         if (LLAMA_SYCL_TARGET STREQUAL "INTEL")
             set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
         elseif (LLAMA_SYCL_TARGET STREQUAL "NVIDIA")
@@ -869,10 +867,6 @@ if (LLAMA_CPU_HBM)
     target_link_libraries(ggml PUBLIC memkind)
 endif()

-if (LLAMA_PERF)
-    add_compile_definitions(GGML_PERF)
-endif()
-
 function(get_flags CCID CCVER)
     set(C_FLAGS "")
     set(CXX_FLAGS "")
@@ -11,9 +11,21 @@
       "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
     }
   },
+  {
+    "name": "sycl-base",
+    "hidden": true,
+    "generator": "Ninja",
+    "binaryDir": "${sourceDir}/build-${presetName}",
+    "cacheVariables": {
+      "CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
+      "CMAKE_CXX_COMPILER": "icx",
+      "LLAMA_SYCL": "ON",
+      "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
+    }
+  },
   { "name": "debug", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } },
-  { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
+  { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } },
+  { "name": "reldbg", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
   { "name": "static", "hidden": true, "cacheVariables": { "LLAMA_STATIC": "ON" } },

   {
@@ -35,15 +47,18 @@
   },

   { "name": "arm64-windows-llvm-debug" , "inherits": [ "base", "arm64-windows-llvm", "debug" ] },
-  { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "release" ] },
+  { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg" ] },
-  { "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "release", "static" ] },
+  { "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg", "static" ] },

   { "name": "arm64-windows-msvc-debug" , "inherits": [ "base", "arm64-windows-msvc", "debug" ] },
-  { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "release" ] },
+  { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg" ] },
-  { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "release", "static" ] },
+  { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg", "static" ] },

   { "name": "x64-windows-msvc-debug" , "inherits": [ "base", "debug" ] },
-  { "name": "x64-windows-msvc-release", "inherits": [ "base", "release" ] },
+  { "name": "x64-windows-msvc-release", "inherits": [ "base", "reldbg" ] },
-  { "name": "x64-windows-msvc+static-release", "inherits": [ "base", "release", "static" ] }
+  { "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] },
+
+  { "name": "x64-windows-sycl-debug" , "inherits": [ "sycl-base", "debug" ] },
+  { "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] }
   ]
 }
Makefile (5 changes)
@@ -344,9 +344,6 @@ ifdef LLAMA_GPROF
    MK_CFLAGS   += -pg
    MK_CXXFLAGS += -pg
 endif
-ifdef LLAMA_PERF
-   MK_CPPFLAGS += -DGGML_PERF
-endif

 # Architecture specific
 # TODO: probably these flags need to be tweaked on some architectures
@@ -1051,7 +1048,7 @@ tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o grammar-
    $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
    $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-tests/test-grammar-integration: tests/test-grammar-integration.cpp ggml.o llama.o grammar-parser.o $(OBJS)
+tests/test-grammar-integration: tests/test-grammar-integration.cpp json-schema-to-grammar.o ggml.o llama.o grammar-parser.o $(OBJS)
    $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
    $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -410,15 +410,9 @@ Output (example):

 4. Install build tools

-a. Download & install cmake for Windows: https://cmake.org/download/
+a. Download & install cmake for Windows: https://cmake.org/download/ (CMake can also be installed from Visual Studio Installer)
+b. The new Visual Studio will install Ninja as default. (If not, please install it manually: https://ninja-build.org/)

-b. Download & install mingw-w64 make for Windows provided by w64devkit
-
-- Download the 1.19.0 version of [w64devkit](https://github.com/skeeto/w64devkit/releases/download/v1.19.0/w64devkit-1.19.0.zip).
-
-- Extract `w64devkit` on your pc.
-
-- Add the **bin** folder path in the Windows system PATH environment (for e.g. `C:\xxx\w64devkit\bin\`).

 ### II. Build llama.cpp
@@ -428,10 +422,10 @@ On the oneAPI command line window, step into the llama.cpp main directory and ru
 @call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force

 # Option 1: Use FP32 (recommended for better performance in most cases)
-cmake -B build -G "MinGW Makefiles" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release
+cmake -B build -G "Ninja" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release

 # Option 2: Or FP16
-cmake -B build -G "MinGW Makefiles" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
+cmake -B build -G "Ninja" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON

 cmake --build build --config Release -j
 ```
@@ -441,9 +435,23 @@ Otherwise, run the `win-build-sycl.bat` wrapper which encapsulates the former in
 .\examples\sycl\win-build-sycl.bat
 ```

+Or, use CMake presets to build:
+```sh
+cmake --preset x64-windows-sycl-release
+cmake --build build-x64-windows-sycl-release -j --target llama-cli
+
+cmake -DLLAMA_SYCL_F16=ON --preset x64-windows-sycl-release
+cmake --build build-x64-windows-sycl-release -j --target llama-cli
+
+cmake --preset x64-windows-sycl-debug
+cmake --build build-x64-windows-sycl-debug -j --target llama-cli
+```
+
+Or, you can use Visual Studio to open llama.cpp folder as a CMake project. Choose the sycl CMake presets (`x64-windows-sycl-release` or `x64-windows-sycl-debug`) before you compile the project.

 *Notes:*

-- By default, calling `make` will build all target binary files. In case of a minimal experimental setup, the user can build the inference executable only through `make llama-cli`.
+- In case of a minimal experimental setup, the user can build the inference executable only through `cmake --build build --config Release -j --target llama-cli`.

 ### III. Run the inference
(One file's diff was suppressed because it is too large.)
@@ -152,7 +152,6 @@ struct gpt_params {
     bool prompt_cache_all = false; // save user input and generations to prompt cache
     bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it

-    bool embedding = false; // get only sentence embedding
     bool escape = true; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
     bool multiline_input = false; // reverse the usage of `\`
     bool simple_io = false; // improves compatibility with subprocesses and limited consoles
@@ -179,6 +178,12 @@ struct gpt_params {
     std::string mmproj = ""; // path to multimodal projector
     std::vector<std::string> image; // path to image file(s)

+    // embedding
+    bool embedding = false; // get only sentence embedding
+    int32_t embd_normalize = 2; // normalisation for embendings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
+    std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
+    std::string embd_sep = "\n"; // separator of embendings
+
     // server params
     int32_t port = 8080; // server listens on this network port
     int32_t timeout_read = 600; // http read timeout in seconds
@@ -377,7 +382,7 @@ void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_siz
 // Embedding utils
 //

-void llama_embd_normalize(const float * inp, float * out, int n);
+void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);

 float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n);
@@ -214,7 +214,7 @@ src_func = f"""
 """

 convert_py_pth = pathlib.Path("convert-hf-to-gguf.py")
-convert_py = convert_py_pth.read_text()
+convert_py = convert_py_pth.read_text(encoding="utf-8")
 convert_py = re.sub(
     r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
     lambda m: m.group(1) + src_func + m.group(3),
@@ -222,7 +222,7 @@ convert_py = re.sub(
     flags=re.DOTALL | re.MULTILINE,
 )

-convert_py_pth.write_text(convert_py)
+convert_py_pth.write_text(convert_py, encoding="utf-8")

 logger.info("+++ convert-hf-to-gguf.py was updated")
@@ -65,7 +65,8 @@ class Model:
     # subclasses should define this!
     model_arch: gguf.MODEL_ARCH

-    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool, model_name: str | None):
+    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool,
+                 model_name: str | None, split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False):
         if type(self) is Model:
             raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
         self.dir_model = dir_model
@@ -96,7 +97,8 @@ class Model:
         ftype_lw: str = ftype_up.lower()
         # allow templating the file name with the output ftype, useful with the "auto" ftype
         self.fname_out = fname_out.parent / fname_out.name.format(ftype_lw, outtype=ftype_lw, ftype=ftype_lw, OUTTYPE=ftype_up, FTYPE=ftype_up)
-        self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)
+        self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file,
+                                           split_max_tensors=split_max_tensors, split_max_size=split_max_size, dry_run=dry_run, small_first_shard=small_first_shard)

     @classmethod
     def __init_subclass__(cls):
@@ -332,6 +334,8 @@ class Model:
         self.gguf_writer.close()

     def write_vocab(self):
+        if len(self.gguf_writer.tensors) != 1:
+            raise ValueError('Splitting the vocabulary is not supported')
         self.gguf_writer.write_header_to_file(self.fname_out)
         self.gguf_writer.write_kv_data_to_file()
         self.gguf_writer.close()
@ -970,7 +974,11 @@ class XverseModel(Model):
|
||||||
from transformers import AutoTokenizer
|
from transformers import AutoTokenizer
|
||||||
tokenizer = AutoTokenizer.from_pretrained(dir_model)
|
tokenizer = AutoTokenizer.from_pretrained(dir_model)
|
||||||
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
|
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
|
||||||
assert max(tokenizer.vocab.values()) < vocab_size
|
# Since we are checking the maximum index, we need to ensure it's strictly less than vocab_size,
|
||||||
|
# because vocab_size is the count of items, and indexes start at 0.
|
||||||
|
max_vocab_index = max(tokenizer.get_vocab().values())
|
||||||
|
if max_vocab_index >= vocab_size:
|
||||||
|
raise ValueError("Vocabulary size exceeds expected maximum size.")
|
||||||
|
|
||||||
reverse_vocab: dict[int, str] = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
|
reverse_vocab: dict[int, str] = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
|
||||||
added_vocab = tokenizer.get_added_vocab()
|
added_vocab = tokenizer.get_added_vocab()
|
||||||
|
@@ -1403,6 +1411,48 @@ class LlamaModel(Model):
                 raise ValueError(f"Unprocessed experts: {experts}")


+@Model.register("BitnetForCausalLM")
+class BitnetModel(Model):
+    model_arch = gguf.MODEL_ARCH.BITNET
+
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+        self.gguf_writer.add_rope_scaling_factor(1.0)
+
+    def weight_quant(self, weight):
+        dtype = weight.dtype
+        weight = weight.float()
+        s = 1 / weight.abs().mean().clamp(min=1e-5)
+        weight = (weight * s).round().clamp(-1, 1) / s
+        scale = weight.abs().max().unsqueeze(0)
+        weight = torch.where(weight.abs().less(1e-6), 0, weight).type(dtype)
+        weight = torch.sign(weight).type(dtype)
+        return weight.type(dtype), scale.type(torch.float32)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        new_name = self.map_tensor_name(name)
+
+        if any(self.match_model_tensor_name(new_name, key, bid) for key in [
+            gguf.MODEL_TENSOR.ATTN_Q,
+            gguf.MODEL_TENSOR.ATTN_K,
+            gguf.MODEL_TENSOR.ATTN_V,
+            gguf.MODEL_TENSOR.ATTN_OUT,
+            gguf.MODEL_TENSOR.FFN_UP,
+            gguf.MODEL_TENSOR.FFN_DOWN,
+            gguf.MODEL_TENSOR.FFN_GATE,
+        ]):
+            # transform weight into 1/0/-1 (in fp32)
+            weight_torch, scale_torch = self.weight_quant(data_torch)
+            yield (new_name, weight_torch)
+            yield (new_name.removesuffix(".weight") + ".scale", scale_torch)
+        else:
+            yield (new_name, data_torch)
+
+
 @Model.register("GrokForCausalLM")
 class GrokModel(Model):
     model_arch = gguf.MODEL_ARCH.GROK
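Editor's note: the `weight_quant` helper in the hunk above performs BitNet-style ternary rounding, i.e. weights are rescaled by the inverse of their mean absolute value, rounded and clamped into {-1, 0, 1}, and a single per-tensor scale is kept. A rough NumPy sketch of the same arithmetic follows; it is an illustration only, not code from this commit.

```python
import numpy as np

def ternary_quant(weight: np.ndarray):
    # scale so the mean |w| is about 1, then round into {-1, 0, 1}
    s = 1.0 / max(np.abs(weight).mean(), 1e-5)
    q = np.clip(np.round(weight * s), -1, 1)
    scale = np.float32(np.abs(q / s).max())   # per-tensor scale, as in weight_quant
    return np.sign(q), scale

w = np.array([0.4, -0.05, 1.2, -0.7])
print(ternary_quant(w))  # ternary weights plus a single fp32 scale
```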
@@ -2728,6 +2778,123 @@ class DeepseekV2Model(Model):
                 raise ValueError(f"Unprocessed experts: {experts}")


+@Model.register("T5ForConditionalGeneration")
+@Model.register("T5WithLMHeadModel")
+class T5Model(Model):
+    model_arch = gguf.MODEL_ARCH.T5
+
+    def set_vocab(self):
+        # to avoid TypeError: Descriptors cannot be created directly
+        # exception when importing sentencepiece_model_pb2
+        os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
+        from sentencepiece import SentencePieceProcessor
+        from sentencepiece import sentencepiece_model_pb2 as model
+
+        tokenizer_path = self.dir_model / 'spiece.model'
+
+        if not tokenizer_path.is_file():
+            raise FileNotFoundError(f"File not found: {tokenizer_path}")
+
+        sentencepiece_model = model.ModelProto()
+        sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
+        add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
+        remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
+        precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
+        assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM
+
+        tokenizer = SentencePieceProcessor()
+        tokenizer.LoadFromFile(str(tokenizer_path))
+
+        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+
+        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
+        scores: list[float] = [-10000.0] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+
+        for token_id in range(tokenizer.vocab_size()):
+            piece = tokenizer.IdToPiece(token_id)
+            text = piece.encode("utf-8")
+            score = tokenizer.GetScore(token_id)
+
+            toktype = SentencePieceTokenTypes.NORMAL
+            if tokenizer.IsUnknown(token_id):
+                toktype = SentencePieceTokenTypes.UNKNOWN
+            elif tokenizer.IsControl(token_id):
+                toktype = SentencePieceTokenTypes.CONTROL
+            elif tokenizer.IsUnused(token_id):
+                toktype = SentencePieceTokenTypes.UNUSED
+            elif tokenizer.IsByte(token_id):
+                toktype = SentencePieceTokenTypes.BYTE
+
+            tokens[token_id] = text
+            scores[token_id] = score
+            toktypes[token_id] = toktype
+
+        added_tokens_file = self.dir_model / 'added_tokens.json'
+        if added_tokens_file.is_file():
+            with open(added_tokens_file, "r", encoding="utf-8") as f:
+                added_tokens_json = json.load(f)
+                for key in added_tokens_json:
+                    token_id = added_tokens_json[key]
+                    if (token_id >= vocab_size):
+                        logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
+                        continue
+
+                    tokens[token_id] = key.encode("utf-8")
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+
+        if vocab_size > len(tokens):
+            pad_count = vocab_size - len(tokens)
+            logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
+            for i in range(1, pad_count + 1):
+                tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
+                scores.append(-1000.0)
+                toktypes.append(SentencePieceTokenTypes.UNUSED)
+
+        self.gguf_writer.add_tokenizer_model("t5")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+        self.gguf_writer.add_add_space_prefix(add_prefix)
+        self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
+        if precompiled_charsmap:
+            self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+        self.gguf_writer.add_add_bos_token(False)
+        self.gguf_writer.add_add_eos_token(True)
+
+    def set_gguf_parameters(self):
+        self.gguf_writer.add_name("T5")
+        self.gguf_writer.add_context_length(self.hparams["n_positions"])
+        self.gguf_writer.add_embedding_length(self.hparams["d_model"])
+        self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"])
+        self.gguf_writer.add_block_count(self.hparams["num_layers"])
+        self.gguf_writer.add_head_count(self.hparams["num_heads"])
+        self.gguf_writer.add_key_length(self.hparams["d_kv"])
+        self.gguf_writer.add_value_length(self.hparams["d_kv"])
+        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_relative_attn_buckets_count(self.hparams["relative_attention_num_buckets"])
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_decoder_start_token_id(self.hparams["decoder_start_token_id"])
+        self.gguf_writer.add_file_type(self.ftype)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid # unused
+
+        # Sometimes T5 and Flan-T5 based models contain "encoder.embed_tokens.weight" tensor or
+        # "decoder.embed_tokens.weight" tensors that are duplicates of "shared.weight" tensor
+        # To prevent errors caused by an unnecessary unmapped tensor, skip both of them and use only "shared.weight".
+        if name == "decoder.embed_tokens.weight" or name == "encoder.embed_tokens.weight":
+            logger.debug(f"Skipping tensor {name!r} in safetensors so that convert can end normally.")
+            return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
 @Model.register("ChatGLMModel")
 class ChatGLMModel(Model):
     model_arch = gguf.MODEL_ARCH.CHATGLM
@@ -2914,6 +3081,7 @@ class ChatGLMModel(Model):
         return [(self.map_tensor_name(name), data_torch)]


+
 ###### CONVERSION LOGIC ######
@@ -2999,10 +3167,44 @@ def parse_args() -> argparse.Namespace:
         "--verbose", action="store_true",
         help="increase output verbosity",
     )
+    parser.add_argument(
+        "--split-max-tensors", type=int, default=0,
+        help="max tensors in each split",
+    )
+    parser.add_argument(
+        "--split-max-size", type=str, default="0",
+        help="max size per split N(M|G)",
+    )
+    parser.add_argument(
+        "--dry-run", action="store_true",
+        help="only print out a split plan and exit, without writing any new files",
+    )
+    parser.add_argument(
+        "--no-tensor-first-split", action="store_true",
+        help="do not add tensors to the first split (disabled by default)"
+    )

     return parser.parse_args()


+def split_str_to_n_bytes(split_str: str) -> int:
+    if split_str.endswith("K"):
+        n = int(split_str[:-1]) * 1000
+    elif split_str.endswith("M"):
+        n = int(split_str[:-1]) * 1000 * 1000
+    elif split_str.endswith("G"):
+        n = int(split_str[:-1]) * 1000 * 1000 * 1000
+    elif split_str.isnumeric():
+        n = int(split_str)
+    else:
+        raise ValueError(f"Invalid split size: {split_str}, must be a number, optionally followed by K, M, or G")
+
+    if n < 0:
+        raise ValueError(f"Invalid split size: {split_str}, must be positive")
+
+    return n
+
+
 def main() -> None:
     args = parse_args()
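Editor's note: the new splitting flags are parsed by `split_str_to_n_bytes` above, which accepts a plain byte count or a decimal K/M/G suffix. A hedged usage sketch, directly derived from the code shown in this hunk (the command line at the end uses a hypothetical model path):

```python
# --split-max-size "500M" caps each shard at 500 * 1000 * 1000 bytes
assert split_str_to_n_bytes("500M") == 500_000_000
assert split_str_to_n_bytes("2G") == 2_000_000_000
assert split_str_to_n_bytes("1024") == 1024   # plain numbers are taken as bytes

# e.g. (hypothetical paths):
#   python convert-hf-to-gguf.py ./my-model --outtype f16 --split-max-size 2G --dry-run
```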
@@ -3035,6 +3237,10 @@ def main() -> None:
         "auto": gguf.LlamaFileType.GUESSED,
     }

+    if args.use_temp_file and (args.split_max_tensors > 0 or args.split_max_size != "0"):
+        logger.error("Error: Cannot use temp file when splitting")
+        sys.exit(1)
+
     if args.outfile is not None:
         fname_out = args.outfile
     else:
@@ -3052,7 +3258,10 @@ def main() -> None:
         logger.error(f"Model {hparams['architectures'][0]} is not supported")
         sys.exit(1)

-    model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, args.no_lazy, args.model_name)
+    model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file,
+                                 args.no_lazy, args.model_name, split_max_tensors=args.split_max_tensors,
+                                 split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run,
+                                 small_first_shard=args.no_tensor_first_split)

     logger.info("Set model parameters")
     model_instance.set_gguf_parameters()
@@ -3063,13 +3272,13 @@ def main() -> None:
     model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION)

     if args.vocab_only:
-        logger.info(f"Exporting model vocab to '{model_instance.fname_out}'")
+        logger.info("Exporting model vocab...")
         model_instance.write_vocab()
+        logger.info("Model vocab successfully exported.")
     else:
-        logger.info(f"Exporting model to '{model_instance.fname_out}'")
+        logger.info("Exporting model...")
         model_instance.write()
-        logger.info(f"Model successfully exported to '{model_instance.fname_out}'")
+        logger.info("Model successfully exported.")


 if __name__ == '__main__':
@@ -17,7 +17,7 @@ Related PRs:
 ./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99

 # With advanced options
-./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --batch-pca 100
+./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --pca-batch 100

 # To see help message
 ./cvector-generator -h
@@ -40,7 +40,7 @@ static void print_usage(int argc, char ** argv, const gpt_params & params) {
     printf("\nexample usage:\n");
     printf("\n CPU only: %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf\n", argv[0]);
     printf("\n with GPU: %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99\n", argv[0]);
-    printf("\n advanced: %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --batch-pca 100\n", argv[0]);
+    printf("\n advanced: %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --pca-batch 100\n", argv[0]);
     printf("\n");
 }
@@ -377,8 +377,8 @@ static int prepare_entries(gpt_params & params, train_context & ctx_train) {
     // create templated prompts
     std::vector<std::string> completions = ctrlvec_load_prompt_file(params.cvector_completions_file, false);
     auto format_template = [](std::string persona, std::string suffix) {
-        // entry in positive/negative.txt must already be formatted i.e. "[INST] Act as if you're extremely happy. [/INST]"
-        return persona + " " + suffix;
+        // entry in positive/negative.txt must already be formatted i.e. "[INST] Act as if you're extremely happy. [/INST] "
+        return persona + suffix;
     };
     for (size_t i = 0; i < positive_prompts.size(); ++i) {
         for (int j = 0; j < std::min((int) completions.size(), params.n_completions); ++j) {
@@ -1 +1 @@
-[INST] Act like a person who is extremely sad. [/INST]
+[INST] Act like a person who is extremely sad. [/INST] 
(in both prompt files the only visible change is a trailing space after "[/INST]", matching the comment change above)
@@ -1 +1 @@
-[INST] Act like a person who is extremely happy. [/INST]
+[INST] Act like a person who is extremely happy. [/INST] 
@@ -19,3 +19,43 @@ llama-embedding.exe -m ./path/to/model --log-disable -p "Hello World!" 2>$null
 ```

 The above command will output space-separated float values.
+
+## extra parameters
+### --embd-normalize $integer$
+
+| $integer$ | description         | formula |
+|-----------|---------------------|---------|
+| $-1$      | none                |
+| $0$       | max absolute int16  | $\Large{{32760 * x_i} \over\max \lvert x_i\rvert}$
+| $1$       | taxicab             | $\Large{x_i \over\sum \lvert x_i\rvert}$
+| $2$       | euclidean (default) | $\Large{x_i \over\sqrt{\sum x_i^2}}$
+| $>2$      | p-norm              | $\Large{x_i \over\sqrt[p]{\sum \lvert x_i\rvert^p}}$
+
+### --embd-output-format $'string'$
+
+| $'string'$ | description                  | |
+|------------|------------------------------|--|
+| ''         | same as before               | (default)
+| 'array'    | single embeddings            | $[[x_1,...,x_n]]$
+|            | multiple embeddings          | $[[x_1,...,x_n],[x_1,...,x_n],...,[x_1,...,x_n]]$
+| 'json'     | openai style                 |
+| 'json+'    | add cosine similarity matrix |
+
+### --embd-separator $"string"$
+
+| $"string"$   | |
+|--------------|-|
+| "\n"         | (default)
+| "<#embSep#>" | for exemple
+| "<#sep#>"    | other exemple
+
+## examples
+### Unix-based systems (Linux, macOS, etc.):
+
+```bash
+./embedding -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null
+```
+
+### Windows:
+
+```powershell
+embedding.exe -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null
+```
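Editor's note: to make the `--embd-normalize` options concrete, here is a small Python sketch of the formulas in the table above. It is an illustration of the math only, not code from the repository.

```python
import numpy as np

def embd_normalize(x: np.ndarray, norm: int = 2) -> np.ndarray:
    if norm < 0:                                   # -1: none
        return x
    if norm == 0:                                  # max absolute int16 scaling
        return x * (32760.0 / np.abs(x).max())
    return x / np.linalg.norm(x, ord=norm)         # 1: taxicab, 2: euclidean, >2: p-norm

v = np.array([3.0, -4.0])
print(embd_normalize(v, 2))  # [ 0.6 -0.8]
```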
@@ -7,23 +7,30 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

-static std::vector<std::string> split_lines(const std::string & s) {
-    std::string line;
+static std::vector<std::string> split_lines(const std::string & s, const std::string & separator = "\n") {
     std::vector<std::string> lines;
-    std::stringstream ss(s);
-    while (std::getline(ss, line)) {
-        lines.push_back(line);
+    size_t start = 0;
+    size_t end = s.find(separator);
+
+    while (end != std::string::npos) {
+        lines.push_back(s.substr(start, end - start));
+        start = end + separator.length();
+        end = s.find(separator, start);
     }
+
+    lines.push_back(s.substr(start)); // Add the last part
+
     return lines;
 }

-static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id) {
-    for (size_t i = 0; i < tokens.size(); i++) {
-        llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1);
+static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) {
+    size_t n_tokens = tokens.size();
+    for (size_t i = 0; i < n_tokens; i++) {
+        llama_batch_add(batch, tokens[i], i, { seq_id }, true);
     }
 }

-static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
+static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) {
     // clear previous kv_cache values (irrelevant for embeddings)
     llama_kv_cache_clear(ctx);
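Editor's note: the rewritten `split_lines` splits on an arbitrary separator instead of only on newlines, which is what makes `--embd-separator` work. A Python one-liner with equivalent behaviour (illustrative only, not repository code):

```python
def split_lines(s: str, separator: str = "\n") -> list[str]:
    # mirrors the C++ find/substr loop for a non-empty separator
    return s.split(separator)

print(split_lines("Castle<#sep#>Stronghold<#sep#>Dog", "<#sep#>"))
# ['Castle', 'Stronghold', 'Dog']
```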
@@ -40,22 +47,10 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu

         // try to get sequence embeddings - supported only when pooling_type is not NONE
         const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
-        if (embd == NULL) {
-            embd = llama_get_embeddings_ith(ctx, i);
-            if (embd == NULL) {
-                fprintf(stderr, "%s: failed to get embeddings for token %d\n", __func__, i);
-                continue;
-            }
-        }
+        GGML_ASSERT(embd != NULL && "failed to get sequence embeddings");

         float * out = output + batch.seq_id[i][0] * n_embd;
-        //TODO: I would also add a parameter here to enable normalization or not.
-        /*fprintf(stdout, "unnormalized_embedding:");
-        for (int hh = 0; hh < n_embd; hh++) {
-            fprintf(stdout, "%9.6f ", embd[hh]);
-        }
-        fprintf(stdout, "\n");*/
-        llama_embd_normalize(embd, out, n_embd);
+        llama_embd_normalize(embd, out, n_embd, embd_norm);
     }
 }
@@ -97,6 +92,12 @@ int main(int argc, char ** argv) {
     const int n_ctx_train = llama_n_ctx_train(model);
     const int n_ctx = llama_n_ctx(ctx);

+    const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
+    if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
+        fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__);
+        return 1;
+    }
+
     if (n_ctx > n_ctx_train) {
         fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
                 __func__, n_ctx_train, n_ctx);
@@ -109,7 +110,7 @@ int main(int argc, char ** argv) {
     }

     // split the prompt into lines
-    std::vector<std::string> prompts = split_lines(params.prompt);
+    std::vector<std::string> prompts = split_lines(params.prompt, params.embd_sep);

     // max batch size
     const uint64_t n_batch = params.n_batch;
@@ -169,7 +170,7 @@ int main(int argc, char ** argv) {
         // encode if at capacity
         if (batch.n_tokens + n_toks > n_batch) {
             float * out = emb + p * n_embd;
-            batch_decode(ctx, batch, out, s, n_embd);
+            batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
             llama_batch_clear(batch);
             p += s;
             s = 0;
@@ -182,29 +183,78 @@ int main(int argc, char ** argv) {

     // final batch
     float * out = emb + p * n_embd;
-    batch_decode(ctx, batch, out, s, n_embd);
+    batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);

-    // print the first part of the embeddings or for a single prompt, the full embedding
-    fprintf(stdout, "\n");
-    for (int j = 0; j < n_prompts; j++) {
-        fprintf(stdout, "embedding %d: ", j);
-        for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
-            fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
-        }
-        fprintf(stdout, "\n");
-    }
-
-    // print cosine similarity matrix
-    if (n_prompts > 1) {
-        fprintf(stdout, "\n");
-        printf("cosine similarity matrix:\n\n");
-        for (int i = 0; i < n_prompts; i++) {
-            for (int j = 0; j < n_prompts; j++) {
-                float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
-                fprintf(stdout, "%6.2f ", sim);
-            }
-            fprintf(stdout, "\n");
-        }
-    }
+    if (params.embd_out.empty()) {
+        // print the first part of the embeddings or for a single prompt, the full embedding
+        fprintf(stdout, "\n");
+        for (int j = 0; j < n_prompts; j++) {
+            fprintf(stdout, "embedding %d: ", j);
+            for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
+                if (params.embd_normalize == 0) {
+                    fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
+                } else {
+                    fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
+                }
+            }
+            fprintf(stdout, "\n");
+        }
+
+        // print cosine similarity matrix
+        if (n_prompts > 1) {
+            fprintf(stdout, "\n");
+            printf("cosine similarity matrix:\n\n");
+            for (int i = 0; i < n_prompts; i++) {
+                fprintf(stdout, "%6.6s ", prompts[i].c_str());
+            }
+            fprintf(stdout, "\n");
+            for (int i = 0; i < n_prompts; i++) {
+                for (int j = 0; j < n_prompts; j++) {
+                    float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
+                    fprintf(stdout, "%6.2f ", sim);
+                }
+                fprintf(stdout, "%1.10s", prompts[i].c_str());
+                fprintf(stdout, "\n");
+            }
+        }
+    }
+
+    if (params.embd_out == "json" || params.embd_out == "json+" || params.embd_out == "array") {
+        const bool notArray = params.embd_out != "array";
+
+        fprintf(stdout, notArray ? "{\n \"object\": \"list\",\n \"data\": [\n" : "[");
+        for (int j = 0;;) { // at least one iteration (one prompt)
+            if (notArray) fprintf(stdout, " {\n \"object\": \"embedding\",\n \"index\": %d,\n \"embedding\": ",j);
+            fprintf(stdout, "[");
+            for (int i = 0;;) { // at least one iteration (n_embd > 0)
+                fprintf(stdout, params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]);
+                i++;
+                if (i < n_embd) fprintf(stdout, ","); else break;
+            }
+            fprintf(stdout, notArray ? "]\n }" : "]");
+            j++;
+            if (j < n_prompts) fprintf(stdout, notArray ? ",\n" : ","); else break;
+        }
+        fprintf(stdout, notArray ? "\n ]" : "]\n");
+
+        if (params.embd_out == "json+" && n_prompts > 1) {
+            fprintf(stdout, ",\n \"cosineSimilarity\": [\n");
+            for (int i = 0;;) { // at least two iteration (n_prompts > 1)
+                fprintf(stdout, " [");
+                for (int j = 0;;) { // at least two iteration (n_prompts > 1)
+                    float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
+                    fprintf(stdout, "%6.2f", sim);
+                    j++;
+                    if (j < n_prompts) fprintf(stdout, ", "); else break;
+                }
+                fprintf(stdout, " ]");
+                i++;
+                if (i < n_prompts) fprintf(stdout, ",\n"); else break;
+            }
+            fprintf(stdout, "\n ]");
+        }
+
+        if (notArray) fprintf(stdout, "\n}\n");
+    }

     // clean up
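Editor's note: the `json` / `json+` output printed above follows an OpenAI-style envelope ("object", "data", per-entry "embedding", and an optional "cosineSimilarity" matrix). A hedged Python sketch of how a caller might consume it; the file name is hypothetical and assumes the tool's stdout was captured to disk.

```python
import json

with open("embeddings.json") as f:               # hypothetical capture of the tool's stdout
    out = json.load(f)

vectors = [d["embedding"] for d in out["data"]]  # one vector per prompt
sims = out.get("cosineSimilarity")               # present only for "json+" with more than one prompt
print(len(vectors), "embeddings;", "similarity matrix present" if sims else "no similarity matrix")
```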
@@ -44,6 +44,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve

     // clear previous kv_cache values (irrelevant for embeddings)
     llama_kv_cache_clear(ctx);
+    llama_set_embeddings(ctx, true);
     llama_set_causal_attn(ctx, false);

     // run model
@@ -97,7 +98,9 @@ static std::string generate(llama_context * ctx, const std::string & prompt, boo
     const llama_model * mdl = llama_get_model(ctx);

     llama_kv_cache_clear(ctx);
+    llama_set_embeddings(ctx, false);
     llama_set_causal_attn(ctx, true);

     llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1);

     std::vector<llama_token> inputs = llama_tokenize(mdl, prompt, false, true);
@@ -165,8 +168,7 @@ int main(int argc, char * argv[]) {

     llama_model * mdl = llama_load_model_from_file(params.model.c_str(), mparams);

-    // create new context - set to embedding mode
-    cparams.embeddings = true;
+    // create generation context
     llama_context * ctx = llama_new_context_with_model(mdl, cparams);

     // ### Embedding/Representation ###
@@ -131,22 +131,29 @@ class LlamaState: ObservableObject {

         messageLog += "\(text)"

-        while await llamaContext.n_cur < llamaContext.n_len {
-            let result = await llamaContext.completion_loop()
-            messageLog += "\(result)"
+        Task.detached {
+            while await llamaContext.n_cur < llamaContext.n_len {
+                let result = await llamaContext.completion_loop()
+                await MainActor.run {
+                    self.messageLog += "\(result)"
+                }
+            }
+
+            let t_end = DispatchTime.now().uptimeNanoseconds
+            let t_generation = Double(t_end - t_heat_end) / self.NS_PER_S
+            let tokens_per_second = Double(await llamaContext.n_len) / t_generation
+
+            await llamaContext.clear()
+
+            await MainActor.run {
+                self.messageLog += """
+                    \n
+                    Done
+                    Heat up took \(t_heat)s
+                    Generated \(tokens_per_second) t/s\n
+                    """
+            }
         }
-
-        let t_end = DispatchTime.now().uptimeNanoseconds
-        let t_generation = Double(t_end - t_heat_end) / NS_PER_S
-        let tokens_per_second = Double(await llamaContext.n_len) / t_generation
-
-        await llamaContext.clear()
-        messageLog += """
-            \n
-            Done
-            Heat up took \(t_heat)s
-            Generated \(tokens_per_second) t/s\n
-            """
     }

     func bench() async {
@@ -16,41 +16,41 @@ struct quant_option {
 };

 static const std::vector<struct quant_option> QUANT_OPTIONS = {
-    { "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 3.56G, +0.2166 ppl @ LLaMA-v1-7B", },
+    { "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
-    { "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 3.90G, +0.1585 ppl @ LLaMA-v1-7B", },
+    { "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 4.78G, +0.4511 ppl @ Llama-3-8B", },
-    { "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 4.33G, +0.0683 ppl @ LLaMA-v1-7B", },
+    { "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 5.21G, +0.1316 ppl @ Llama-3-8B", },
-    { "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", },
+    { "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 5.65G, +0.1062 ppl @ Llama-3-8B", },
     { "IQ2_XXS",LLAMA_FTYPE_MOSTLY_IQ2_XXS," 2.06 bpw quantization", },
     { "IQ2_XS", LLAMA_FTYPE_MOSTLY_IQ2_XS, " 2.31 bpw quantization", },
     { "IQ2_S", LLAMA_FTYPE_MOSTLY_IQ2_S, " 2.5 bpw quantization", },
     { "IQ2_M", LLAMA_FTYPE_MOSTLY_IQ2_M, " 2.7 bpw quantization", },
     { "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", },
     { "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", },
-    { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
+    { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.96G, +3.5199 ppl @ Llama-3-8B", },
-    { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", },
+    { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.96G, +3.1836 ppl @ Llama-3-8B", },
     { "IQ3_XXS",LLAMA_FTYPE_MOSTLY_IQ3_XXS," 3.06 bpw quantization", },
     { "IQ3_S", LLAMA_FTYPE_MOSTLY_IQ3_S, " 3.44 bpw quantization", },
     { "IQ3_M", LLAMA_FTYPE_MOSTLY_IQ3_M, " 3.66 bpw quantization mix", },
     { "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
-    { "IQ3_XS", LLAMA_FTYPE_MOSTLY_IQ3_XS, " 3.3 bpw quantization" , },
+    { "IQ3_XS", LLAMA_FTYPE_MOSTLY_IQ3_XS, " 3.3 bpw quantization", },
-    { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },
+    { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 3.41G, +1.6321 ppl @ Llama-3-8B", },
-    { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", },
+    { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.74G, +0.6569 ppl @ Llama-3-8B", },
-    { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1764 ppl @ LLaMA-v1-7B", },
+    { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 4.03G, +0.5562 ppl @ Llama-3-8B", },
     { "IQ4_NL", LLAMA_FTYPE_MOSTLY_IQ4_NL, " 4.50 bpw non-linear quantization", },
     { "IQ4_XS", LLAMA_FTYPE_MOSTLY_IQ4_XS, " 4.25 bpw non-linear quantization", },
     { "Q4_K", LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", },
-    { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 3.59G, +0.0992 ppl @ LLaMA-v1-7B", },
+    { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 4.37G, +0.2689 ppl @ Llama-3-8B", },
-    { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 3.80G, +0.0532 ppl @ LLaMA-v1-7B", },
+    { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 4.58G, +0.1754 ppl @ Llama-3-8B", },
     { "Q5_K", LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M", },
-    { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0400 ppl @ LLaMA-v1-7B", },
+    { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 5.21G, +0.1049 ppl @ Llama-3-8B", },
-    { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", },
+    { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 5.33G, +0.0569 ppl @ Llama-3-8B", },
-    { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, +0.0008 ppl @ LLaMA-v1-7B", },
+    { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 6.14G, +0.0217 ppl @ Llama-3-8B", },
-    { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
+    { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 7.96G, +0.0026 ppl @ Llama-3-8B", },
-    { "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, -0.0020 ppl @ Mistral-7B", },
+    { "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, +0.0020 ppl @ Mistral-7B", },
     { "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", },
     { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
     // Note: Ensure COPY comes after F32 to avoid ftype 0 from matching.
     { "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", },
 };

 static const char * const LLM_KV_QUANTIZE_IMATRIX_FILE = "quantize.imatrix.file";
@ -73,9 +73,10 @@ static std::vector<chunk> chunk_file(const std::string & filename, int chunk_siz
|
||||||
return chunks;
|
return chunks;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id) {
|
static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) {
|
||||||
for (size_t i = 0; i < tokens.size(); i++) {
|
size_t n_tokens = tokens.size();
|
||||||
llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1);
|
for (size_t i = 0; i < n_tokens; i++) {
|
||||||
|
llama_batch_add(batch, tokens[i], i, { seq_id }, true);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -160,6 +161,12 @@ int main(int argc, char ** argv) {
|
||||||
const int n_ctx_train = llama_n_ctx_train(model);
|
const int n_ctx_train = llama_n_ctx_train(model);
|
||||||
const int n_ctx = llama_n_ctx(ctx);
|
const int n_ctx = llama_n_ctx(ctx);
|
||||||
|
|
||||||
|
const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
|
||||||
|
if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
|
||||||
|
fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
if (n_ctx > n_ctx_train) {
|
if (n_ctx > n_ctx_train) {
|
||||||
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
|
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
|
||||||
__func__, n_ctx_train, n_ctx);
|
__func__, n_ctx_train, n_ctx);
|
||||||
|
|
|
@ -634,12 +634,12 @@ return html`
|
||||||
<div>
|
<div>
|
||||||
<div class="grammar">
|
<div class="grammar">
|
||||||
<label for="template"></label>
|
<label for="template"></label>
|
||||||
<textarea id="grammar" name="grammar" placeholder="Use GBNF or JSON-Scheme + Converter" value="${params.value.grammar}" rows=4 oninput=${updateParams}/>
|
<textarea id="grammar" name="grammar" placeholder="Use GBNF or JSON Schema + Converter" value="${params.value.grammar}" rows=4 oninput=${updateParams}/>
|
||||||
</div>
|
</div>
|
||||||
<div class="grammar-columns">
|
<div class="grammar-columns">
|
||||||
<div class="json-schema-controls">
|
<div class="json-schema-controls">
|
||||||
<input type="text" name="prop-order" placeholder="Order: prop1,prop2,prop3" oninput=${updateGrammarJsonSchemaPropOrder} />
|
<input type="text" name="prop-order" placeholder="Order: prop1,prop2,prop3" oninput=${updateGrammarJsonSchemaPropOrder} />
|
||||||
<button type="button" class="button-grammar" onclick=${convertJSONSchemaGrammar}>Convert JSON-Scheme</button>
|
<button type="button" class="button-grammar" onclick=${convertJSONSchemaGrammar}>Convert JSON Schema</button>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
|
@ -13,16 +13,16 @@ if %errorlevel% neq 0 goto ERROR
|
||||||
|
|
||||||
:: for FP16
|
:: for FP16
|
||||||
:: faster for long-prompt inference
|
:: faster for long-prompt inference
|
||||||
:: cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
|
:: cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
|
||||||
|
|
||||||
:: for FP32
|
:: for FP32
|
||||||
cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
|
cmake -G "Ninja" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
|
||||||
if %errorlevel% neq 0 goto ERROR
|
if %errorlevel% neq 0 goto ERROR
|
||||||
:: build example/main only
|
:: build example/main only
|
||||||
:: make main
|
:: make main
|
||||||
|
|
||||||
:: build all binary
|
:: build all binary
|
||||||
make -j
|
cmake --build . -j
|
||||||
if %errorlevel% neq 0 goto ERROR
|
if %errorlevel% neq 0 goto ERROR
|
||||||
|
|
||||||
cd ..
|
cd ..
|
||||||
|
|
|
@ -635,7 +635,7 @@ static int64_t get_row_rounding(const std::array<float, GGML_CUDA_MAX_DEVICES> &
|
||||||
}
|
}
|
||||||
|
|
||||||
const int cc = ggml_cuda_info().devices[id].cc;
|
const int cc = ggml_cuda_info().devices[id].cc;
|
||||||
row_rounding = std::max(row_rounding, (int64_t)get_mmq_y_host(cc, get_mmq_x_max_host(cc)));
|
row_rounding = std::max(row_rounding, (int64_t)get_mmq_y_host(cc));
|
||||||
}
|
}
|
||||||
return row_rounding;
|
return row_rounding;
|
||||||
}
|
}
|
||||||
|
|
|
@ -643,7 +643,7 @@ struct ggml_cuda_type_traits<GGML_TYPE_IQ3_S> {
|
||||||
static constexpr int qi = QI3_S;
|
static constexpr int qi = QI3_S;
|
||||||
};
|
};
|
||||||
|
|
||||||
static int get_mmq_x_max_host(const int cc) {
|
static constexpr int get_mmq_x_max_host(int cc) {
|
||||||
#ifdef CUDA_USE_TENSOR_CORES
|
#ifdef CUDA_USE_TENSOR_CORES
|
||||||
return cc >= CC_VOLTA && cc < CC_OFFSET_AMD ? MMQ_MAX_BATCH_SIZE : 64;
|
return cc >= CC_VOLTA && cc < CC_OFFSET_AMD ? MMQ_MAX_BATCH_SIZE : 64;
|
||||||
#else
|
#else
|
||||||
|
@ -652,8 +652,8 @@ static int get_mmq_x_max_host(const int cc) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Round rows to this value for --split-mode row:
|
// Round rows to this value for --split-mode row:
|
||||||
static int get_mmq_y_host(const int cc, const int mmq_x) {
|
static constexpr int get_mmq_y_host(int cc) {
|
||||||
return cc >= CC_VOLTA && mmq_x >= 32 ? 128 : 64;
|
return cc >= CC_VOLTA ? 128 : 64;
|
||||||
}
|
}
|
||||||
|
|
||||||
//////////////////////
|
//////////////////////
|
||||||
|
|
|
@ -20,6 +20,20 @@ struct mma_int_A_I16K4 {
|
||||||
GGML_CUDA_ASSUME(ret < K);
|
GGML_CUDA_ASSUME(ret < K);
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ void load(const int * __restrict__ xs0, const int & stride) {
|
||||||
|
#if defined(INT8_MMA_AVAILABLE)
|
||||||
|
const int * xs = xs0 + (threadIdx.x%I)*stride + (threadIdx.x/I)*(K/2);
|
||||||
|
asm("ldmatrix.sync.aligned.m8n8.x2.b16 {%0, %1}, [%2];"
|
||||||
|
: "+r"(x[0]), "+r"(x[1])
|
||||||
|
: "l"(xs));
|
||||||
|
#else
|
||||||
|
#pragma unroll
|
||||||
|
for (int l = 0; l < ne; ++l) {
|
||||||
|
x[l] = xs0[get_i(l)*stride + get_k(l)];
|
||||||
|
}
|
||||||
|
#endif // defined(INT8_MMA_AVAILABLE)
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
struct mma_int_A_I16K8 {
|
struct mma_int_A_I16K8 {
|
||||||
|
@ -42,6 +56,20 @@ struct mma_int_A_I16K8 {
|
||||||
GGML_CUDA_ASSUME(ret < K);
|
GGML_CUDA_ASSUME(ret < K);
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ void load(const int * __restrict__ xs0, const int & stride) {
|
||||||
|
#if defined(INT8_MMA_AVAILABLE)
|
||||||
|
const int * xs = xs0 + (threadIdx.x%I)*stride + (threadIdx.x/I)*(K/2);
|
||||||
|
asm("ldmatrix.sync.aligned.m8n8.x4.b16 {%0, %1, %2, %3}, [%4];"
|
||||||
|
: "+r"(x[0]), "+r"(x[1]), "+r"(x[2]), "+r"(x[3])
|
||||||
|
: "l"(xs));
|
||||||
|
#else
|
||||||
|
#pragma unroll
|
||||||
|
for (int l = 0; l < ne; ++l) {
|
||||||
|
x[l] = xs0[get_i(l)*stride + get_k(l)];
|
||||||
|
}
|
||||||
|
#endif // defined(INT8_MMA_AVAILABLE)
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
struct mma_int_B_J8K4 {
|
struct mma_int_B_J8K4 {
|
||||||
|
@ -64,6 +92,20 @@ struct mma_int_B_J8K4 {
|
||||||
GGML_CUDA_ASSUME(ret < K);
|
GGML_CUDA_ASSUME(ret < K);
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ void load(const int * __restrict__ xs0, const int & stride) {
|
||||||
|
#if defined(INT8_MMA_AVAILABLE) && false // Loading as 4 byte values is faster
|
||||||
|
const int * xs = xs0 + (threadIdx.x%J)*stride;
|
||||||
|
asm("ldmatrix.sync.aligned.m8n8.x1.b16 {%0}, [%1];"
|
||||||
|
: "+r"(x[0])
|
||||||
|
: "l"(xs));
|
||||||
|
#else
|
||||||
|
#pragma unroll
|
||||||
|
for (int l = 0; l < ne; ++l) {
|
||||||
|
x[l] = xs0[get_j(l)*stride + get_k(l)];
|
||||||
|
}
|
||||||
|
#endif // defined(INT8_MMA_AVAILABLE)
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
struct mma_int_B_J8K8 {
|
struct mma_int_B_J8K8 {
|
||||||
|
@ -86,6 +128,20 @@ struct mma_int_B_J8K8 {
|
||||||
GGML_CUDA_ASSUME(ret < K);
|
GGML_CUDA_ASSUME(ret < K);
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ void load(const int * __restrict__ xs0, const int & stride) {
|
||||||
|
#if defined(INT8_MMA_AVAILABLE) && false // Loading as 4 byte values is faster
|
||||||
|
const int * xs = xs0 + (threadIdx.x%J)*stride + ((threadIdx.x/J)*(K/2)) % K;
|
||||||
|
asm("ldmatrix.sync.aligned.m8n8.x2.b16 {%0, %1}, [%2];"
|
||||||
|
: "+r"(x[0]), "+r"(x[1])
|
||||||
|
: "l"(xs));
|
||||||
|
#else
|
||||||
|
#pragma unroll
|
||||||
|
for (int l = 0; l < ne; ++l) {
|
||||||
|
x[l] = xs0[get_j(l)*stride + get_k(l)];
|
||||||
|
}
|
||||||
|
#endif // defined(INT8_MMA_AVAILABLE)
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
struct mma_int_C_I16J8 {
|
struct mma_int_C_I16J8 {
|
||||||
|
|
|
@ -30,34 +30,34 @@ void ggml_cuda_op_mul_mat_q(
|
||||||
|
|
||||||
switch (src0->type) {
|
switch (src0->type) {
|
||||||
case GGML_TYPE_Q4_0:
|
case GGML_TYPE_Q4_0:
|
||||||
mul_mat_q_case<GGML_TYPE_Q4_0>(args, stream);
|
mul_mat_q_case<GGML_TYPE_Q4_0>(ctx, args, stream);
|
||||||
break;
|
break;
|
||||||
case GGML_TYPE_Q4_1:
|
case GGML_TYPE_Q4_1:
|
||||||
mul_mat_q_case<GGML_TYPE_Q4_1>(args, stream);
|
mul_mat_q_case<GGML_TYPE_Q4_1>(ctx, args, stream);
|
||||||
break;
|
break;
|
||||||
case GGML_TYPE_Q5_0:
|
case GGML_TYPE_Q5_0:
|
||||||
mul_mat_q_case<GGML_TYPE_Q5_0>(args, stream);
|
mul_mat_q_case<GGML_TYPE_Q5_0>(ctx, args, stream);
|
||||||
break;
|
break;
|
||||||
case GGML_TYPE_Q5_1:
|
case GGML_TYPE_Q5_1:
|
||||||
mul_mat_q_case<GGML_TYPE_Q5_1>(args, stream);
|
mul_mat_q_case<GGML_TYPE_Q5_1>(ctx, args, stream);
|
||||||
break;
|
break;
|
||||||
case GGML_TYPE_Q8_0:
|
case GGML_TYPE_Q8_0:
|
||||||
mul_mat_q_case<GGML_TYPE_Q8_0>(args, stream);
|
mul_mat_q_case<GGML_TYPE_Q8_0>(ctx, args, stream);
|
||||||
break;
|
break;
|
||||||
case GGML_TYPE_Q2_K:
|
case GGML_TYPE_Q2_K:
|
||||||
mul_mat_q_case<GGML_TYPE_Q2_K>(args, stream);
|
mul_mat_q_case<GGML_TYPE_Q2_K>(ctx, args, stream);
|
||||||
break;
|
break;
|
||||||
case GGML_TYPE_Q3_K:
|
case GGML_TYPE_Q3_K:
|
||||||
mul_mat_q_case<GGML_TYPE_Q3_K>(args, stream);
|
mul_mat_q_case<GGML_TYPE_Q3_K>(ctx, args, stream);
|
||||||
break;
|
break;
|
||||||
case GGML_TYPE_Q4_K:
|
case GGML_TYPE_Q4_K:
|
||||||
mul_mat_q_case<GGML_TYPE_Q4_K>(args, stream);
|
mul_mat_q_case<GGML_TYPE_Q4_K>(ctx, args, stream);
|
||||||
break;
|
break;
|
||||||
case GGML_TYPE_Q5_K:
|
case GGML_TYPE_Q5_K:
|
||||||
mul_mat_q_case<GGML_TYPE_Q5_K>(args, stream);
|
mul_mat_q_case<GGML_TYPE_Q5_K>(ctx, args, stream);
|
||||||
break;
|
break;
|
||||||
case GGML_TYPE_Q6_K:
|
case GGML_TYPE_Q6_K:
|
||||||
mul_mat_q_case<GGML_TYPE_Q6_K>(args, stream);
|
mul_mat_q_case<GGML_TYPE_Q6_K>(ctx, args, stream);
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
GGML_ASSERT(false);
|
GGML_ASSERT(false);
|
||||||
|
|
1775
ggml-cuda/mmq.cuh
1775
ggml-cuda/mmq.cuh
File diff suppressed because it is too large
Load diff
699
ggml-quants.c
699
ggml-quants.c
|
@ -8814,7 +8814,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined (__AVX2__) || defined (__ARM_NEON) || defined (__POWER9_VECTOR__) || defined(__loongarch_asx)
|
#if defined (__AVX__) || defined (__AVX2__) || defined (__ARM_NEON) || defined (__POWER9_VECTOR__) || defined(__loongarch_asx)
|
||||||
static const int8_t keven_signs_q2xs[1024] = {
|
static const int8_t keven_signs_q2xs[1024] = {
|
||||||
1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1,
|
1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1,
|
||||||
1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, -1,
|
1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, -1,
|
||||||
|
@ -8947,6 +8947,61 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
|
||||||
|
|
||||||
*s = 0.125f * hsum_float_8(accumf);
|
*s = 0.125f * hsum_float_8(accumf);
|
||||||
|
|
||||||
|
#elif defined(__AVX__)
|
||||||
|
const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
|
||||||
|
|
||||||
|
uint32_t aux32[4];
|
||||||
|
const uint8_t * aux8 = (const uint8_t *)aux32;
|
||||||
|
|
||||||
|
__m256 accumf = _mm256_setzero_ps();
|
||||||
|
for (int i = 0; i < nb; ++i) {
|
||||||
|
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||||
|
const uint16_t * restrict q2 = x[i].qs;
|
||||||
|
const int8_t * restrict q8 = y[i].qs;
|
||||||
|
__m128i sumi1_0 = _mm_setzero_si128();
|
||||||
|
__m128i sumi1_1 = _mm_setzero_si128();
|
||||||
|
__m128i sumi2_0 = _mm_setzero_si128();
|
||||||
|
__m128i sumi2_1 = _mm_setzero_si128();
|
||||||
|
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
||||||
|
const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||||
|
const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||||
|
const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||||
|
const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||||
|
memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8;
|
||||||
|
const __m128i q2_1_0 = _mm_set_epi64x(iq2xxs_grid[aux8[1]], iq2xxs_grid[aux8[0]]);
|
||||||
|
const __m128i q2_1_1 = _mm_set_epi64x(iq2xxs_grid[aux8[3]], iq2xxs_grid[aux8[2]]);
|
||||||
|
const __m128i q2_2_0 = _mm_set_epi64x(iq2xxs_grid[aux8[9]], iq2xxs_grid[aux8[8]]);
|
||||||
|
const __m128i q2_2_1 = _mm_set_epi64x(iq2xxs_grid[aux8[11]], iq2xxs_grid[aux8[10]]);
|
||||||
|
const __m128i s2_1_0 = _mm_set_epi64x(signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]);
|
||||||
|
const __m128i s2_1_1 = _mm_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127]);
|
||||||
|
const __m128i s2_2_0 = _mm_set_epi64x(signs64[(aux32[3] >> 7) & 127], signs64[(aux32[3] >> 0) & 127]);
|
||||||
|
const __m128i s2_2_1 = _mm_set_epi64x(signs64[(aux32[3] >> 21) & 127], signs64[(aux32[3] >> 14) & 127]);
|
||||||
|
const __m128i q8s_1_0 = _mm_sign_epi8(q8_1_0, s2_1_0);
|
||||||
|
const __m128i q8s_1_1 = _mm_sign_epi8(q8_1_1, s2_1_1);
|
||||||
|
const __m128i q8s_2_0 = _mm_sign_epi8(q8_2_0, s2_2_0);
|
||||||
|
const __m128i q8s_2_1 = _mm_sign_epi8(q8_2_1, s2_2_1);
|
||||||
|
const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
|
||||||
|
const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
|
||||||
|
const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
|
||||||
|
const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
|
||||||
|
const uint16_t ls1 = aux32[1] >> 28;
|
||||||
|
const uint16_t ls2 = aux32[3] >> 28;
|
||||||
|
const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(2*ls1+1));
|
||||||
|
const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(2*ls1+1));
|
||||||
|
const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(2*ls2+1));
|
||||||
|
const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(2*ls2+1));
|
||||||
|
sumi1_0 = _mm_add_epi32(sumi1_0, p1_0);
|
||||||
|
sumi1_1 = _mm_add_epi32(sumi1_1, p1_1);
|
||||||
|
sumi2_0 = _mm_add_epi32(sumi2_0, p2_0);
|
||||||
|
sumi2_1 = _mm_add_epi32(sumi2_1, p2_1);
|
||||||
|
}
|
||||||
|
|
||||||
|
accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
*s = 0.125f * hsum_float_8(accumf);
|
||||||
|
|
||||||
#elif defined(__POWER9_VECTOR__)
|
#elif defined(__POWER9_VECTOR__)
|
||||||
const vector int v0 = vec_splats((int32_t)0);
|
const vector int v0 = vec_splats((int32_t)0);
|
||||||
vector float vsumf0 = vec_splats(0.0f);
|
vector float vsumf0 = vec_splats(0.0f);
|
||||||
|
@ -9290,6 +9345,165 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
|
||||||
}
|
}
|
||||||
|
|
||||||
*s = 0.125f * hsum_float_8(accumf);
|
*s = 0.125f * hsum_float_8(accumf);
|
||||||
|
|
||||||
|
#elif defined(__AVX__)
|
||||||
|
const __m128i mone = _mm_set1_epi8(1);
|
||||||
|
static const char block_sign_shuffle_mask_1[32] = {
|
||||||
|
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
|
||||||
|
0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
|
||||||
|
};
|
||||||
|
static const char block_sign_shuffle_mask_2[32] = {
|
||||||
|
0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a,
|
||||||
|
0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e,
|
||||||
|
};
|
||||||
|
static const uint8_t bit_selector_mask_bytes[32] = {
|
||||||
|
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
|
||||||
|
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
|
||||||
|
};
|
||||||
|
|
||||||
|
const __m128i bit_selector_mask_0 = _mm_loadu_si128((const __m128i*)bit_selector_mask_bytes);
|
||||||
|
const __m128i bit_selector_mask_1 = _mm_loadu_si128((const __m128i*)bit_selector_mask_bytes + 1);
|
||||||
|
const __m128i block_sign_shuffle_1_0 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_1);
|
||||||
|
const __m128i block_sign_shuffle_1_1 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_1 + 1);
|
||||||
|
const __m128i block_sign_shuffle_2_0 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_2);
|
||||||
|
const __m128i block_sign_shuffle_2_1 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_2 + 1);
|
||||||
|
|
||||||
|
static const uint8_t k_bit_helper[32] = {
|
||||||
|
0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
|
||||||
|
0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
|
||||||
|
};
|
||||||
|
const __m128i bit_helper_0 = _mm_loadu_si128((const __m128i*)k_bit_helper);
|
||||||
|
const __m128i bit_helper_1 = _mm_loadu_si128((const __m128i*)k_bit_helper + 1);
|
||||||
|
const __m128i m511 = _mm_set1_epi16(511);
|
||||||
|
const __m128i m4 = _mm_set1_epi8(0xf);
|
||||||
|
const __m128i m1 = _mm_set1_epi8(1);
|
||||||
|
|
||||||
|
uint64_t aux64;
|
||||||
|
|
||||||
|
// somewhat hacky, but gives a significant boost in performance
|
||||||
|
__m256i aux_gindex;
|
||||||
|
const uint16_t * gindex = (const uint16_t *)&aux_gindex;
|
||||||
|
|
||||||
|
__m256 accumf = _mm256_setzero_ps();
|
||||||
|
for (int i = 0; i < nb; ++i) {
|
||||||
|
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||||
|
const uint16_t * restrict q2 = x[i].qs;
|
||||||
|
const int8_t * restrict q8 = y[i].qs;
|
||||||
|
|
||||||
|
memcpy(&aux64, x[i].scales, 8);
|
||||||
|
__m128i stmp = _mm_set1_epi64x(aux64);
|
||||||
|
stmp = _mm_unpacklo_epi8(_mm_and_si128(stmp, m4), _mm_and_si128(_mm_srli_epi16(stmp, 4), m4));
|
||||||
|
const __m128i scales = _mm_add_epi8(_mm_slli_epi16(stmp, 1), m1);
|
||||||
|
|
||||||
|
__m128i sumi1_0 = _mm_setzero_si128();
|
||||||
|
__m128i sumi1_1 = _mm_setzero_si128();
|
||||||
|
__m128i sumi2_0 = _mm_setzero_si128();
|
||||||
|
__m128i sumi2_1 = _mm_setzero_si128();
|
||||||
|
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 4) {
|
||||||
|
|
||||||
|
const __m128i q2_data_0 = _mm_loadu_si128((const __m128i*)q2);
|
||||||
|
const __m128i q2_data_1 = _mm_loadu_si128((const __m128i*)q2 + 1); q2 += 16;
|
||||||
|
aux_gindex = MM256_SET_M128I(_mm_and_si128(q2_data_1, m511), _mm_and_si128(q2_data_0, m511));
|
||||||
|
|
||||||
|
const __m128i partial_sign_bits_0 = _mm_srli_epi16(q2_data_0, 9);
|
||||||
|
const __m128i partial_sign_bits_1 = _mm_srli_epi16(q2_data_1, 9);
|
||||||
|
const __m128i partial_sign_bits_upper_0 = _mm_srli_epi16(q2_data_0, 13);
|
||||||
|
const __m128i partial_sign_bits_upper_1 = _mm_srli_epi16(q2_data_1, 13);
|
||||||
|
const __m128i partial_sign_bits_for_counting_0 = _mm_xor_si128(partial_sign_bits_0, partial_sign_bits_upper_0);
|
||||||
|
const __m128i partial_sign_bits_for_counting_1 = _mm_xor_si128(partial_sign_bits_1, partial_sign_bits_upper_1);
|
||||||
|
|
||||||
|
const __m128i odd_bits_0 = _mm_shuffle_epi8(bit_helper_0, partial_sign_bits_for_counting_0);
|
||||||
|
const __m128i odd_bits_1 = _mm_shuffle_epi8(bit_helper_1, partial_sign_bits_for_counting_1);
|
||||||
|
const __m128i full_sign_bits_0 = _mm_or_si128(partial_sign_bits_0, odd_bits_0);
|
||||||
|
const __m128i full_sign_bits_1 = _mm_or_si128(partial_sign_bits_1, odd_bits_1);
|
||||||
|
|
||||||
|
const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||||
|
const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||||
|
const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||||
|
const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||||
|
const __m128i q8_3_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||||
|
const __m128i q8_3_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||||
|
const __m128i q8_4_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||||
|
const __m128i q8_4_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||||
|
|
||||||
|
const __m128i q2_1_0 = _mm_set_epi64x(iq2xs_grid[gindex[1]], iq2xs_grid[gindex[0]]);
|
||||||
|
const __m128i q2_1_1 = _mm_set_epi64x(iq2xs_grid[gindex[3]], iq2xs_grid[gindex[2]]);
|
||||||
|
const __m128i q2_2_0 = _mm_set_epi64x(iq2xs_grid[gindex[5]], iq2xs_grid[gindex[4]]);
|
||||||
|
const __m128i q2_2_1 = _mm_set_epi64x(iq2xs_grid[gindex[7]], iq2xs_grid[gindex[6]]);
|
||||||
|
const __m128i q2_3_0 = _mm_set_epi64x(iq2xs_grid[gindex[9]], iq2xs_grid[gindex[8]]);
|
||||||
|
const __m128i q2_3_1 = _mm_set_epi64x(iq2xs_grid[gindex[11]], iq2xs_grid[gindex[10]]);
|
||||||
|
const __m128i q2_4_0 = _mm_set_epi64x(iq2xs_grid[gindex[13]], iq2xs_grid[gindex[12]]);
|
||||||
|
const __m128i q2_4_1 = _mm_set_epi64x(iq2xs_grid[gindex[15]], iq2xs_grid[gindex[14]]);
|
||||||
|
|
||||||
|
// AVX2 full_signs_1 is full_sign_bits_0 here
|
||||||
|
// AVX2 full_signs_2 is full_sign_bits_1 here
|
||||||
|
__m128i signs_0, signs_1;
|
||||||
|
signs_0 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_1_0);
|
||||||
|
signs_1 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_1_1);
|
||||||
|
signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0);
|
||||||
|
signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1);
|
||||||
|
const __m128i q8s_1_0 = _mm_sign_epi8(q8_1_0, _mm_or_si128(signs_0, mone));
|
||||||
|
const __m128i q8s_1_1 = _mm_sign_epi8(q8_1_1, _mm_or_si128(signs_1, mone));
|
||||||
|
|
||||||
|
signs_0 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_2_0);
|
||||||
|
signs_1 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_2_1);
|
||||||
|
signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0);
|
||||||
|
signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1);
|
||||||
|
const __m128i q8s_2_0 = _mm_sign_epi8(q8_2_0, _mm_or_si128(signs_0, mone));
|
||||||
|
const __m128i q8s_2_1 = _mm_sign_epi8(q8_2_1, _mm_or_si128(signs_1, mone));
|
||||||
|
|
||||||
|
signs_0 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_1_0);
|
||||||
|
signs_1 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_1_1);
|
||||||
|
signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0);
|
||||||
|
signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1);
|
||||||
|
const __m128i q8s_3_0 = _mm_sign_epi8(q8_3_0, _mm_or_si128(signs_0, mone));
|
||||||
|
const __m128i q8s_3_1 = _mm_sign_epi8(q8_3_1, _mm_or_si128(signs_1, mone));
|
||||||
|
|
||||||
|
signs_0 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_2_0);
|
||||||
|
signs_1 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_2_1);
|
||||||
|
signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0);
|
||||||
|
signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1);
|
||||||
|
const __m128i q8s_4_0 = _mm_sign_epi8(q8_4_0, _mm_or_si128(signs_0, mone));
|
||||||
|
const __m128i q8s_4_1 = _mm_sign_epi8(q8_4_1, _mm_or_si128(signs_1, mone));
|
||||||
|
|
||||||
|
const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
|
||||||
|
const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
|
||||||
|
const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
|
||||||
|
const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
|
||||||
|
const __m128i dot3_0 = _mm_maddubs_epi16(q2_3_0, q8s_3_0);
|
||||||
|
const __m128i dot3_1 = _mm_maddubs_epi16(q2_3_1, q8s_3_1);
|
||||||
|
const __m128i dot4_0 = _mm_maddubs_epi16(q2_4_0, q8s_4_0);
|
||||||
|
const __m128i dot4_1 = _mm_maddubs_epi16(q2_4_1, q8s_4_1);
|
||||||
|
|
||||||
|
__m128i sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+0));
|
||||||
|
const __m128i sc1_0 = _mm_cvtepi8_epi16(sc_tmp);
|
||||||
|
const __m128i sc1_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8));
|
||||||
|
sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+1));
|
||||||
|
const __m128i sc2_0 = _mm_cvtepi8_epi16(sc_tmp);
|
||||||
|
const __m128i sc2_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8));
|
||||||
|
sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+2));
|
||||||
|
const __m128i sc3_0 = _mm_cvtepi8_epi16(sc_tmp);
|
||||||
|
const __m128i sc3_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8));
|
||||||
|
sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+3));
|
||||||
|
const __m128i sc4_0 = _mm_cvtepi8_epi16(sc_tmp);
|
||||||
|
const __m128i sc4_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8));
|
||||||
|
|
||||||
|
sumi1_0 = _mm_add_epi32(sumi1_0, _mm_madd_epi16(dot1_0, sc1_0));
|
||||||
|
sumi1_1 = _mm_add_epi32(sumi1_1, _mm_madd_epi16(dot1_1, sc1_1));
|
||||||
|
sumi2_0 = _mm_add_epi32(sumi2_0, _mm_madd_epi16(dot2_0, sc2_0));
|
||||||
|
sumi2_1 = _mm_add_epi32(sumi2_1, _mm_madd_epi16(dot2_1, sc2_1));
|
||||||
|
sumi1_0 = _mm_add_epi32(sumi1_0, _mm_madd_epi16(dot3_0, sc3_0));
|
||||||
|
sumi1_1 = _mm_add_epi32(sumi1_1, _mm_madd_epi16(dot3_1, sc3_1));
|
||||||
|
sumi2_0 = _mm_add_epi32(sumi2_0, _mm_madd_epi16(dot4_0, sc4_0));
|
||||||
|
sumi2_1 = _mm_add_epi32(sumi2_1, _mm_madd_epi16(dot4_1, sc4_1));
|
||||||
|
}
|
||||||
|
|
||||||
|
accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
*s = 0.125f * hsum_float_8(accumf);
|
||||||
|
|
||||||
#elif defined(__loongarch_asx)
|
#elif defined(__loongarch_asx)
|
||||||
|
|
||||||
const __m256i mone = __lasx_xvreplgr2vr_b(1);
|
const __m256i mone = __lasx_xvreplgr2vr_b(1);
|
||||||
|
@ -9693,6 +9907,98 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
|
||||||
|
|
||||||
*s = 0.125f * hsum_float_8(accumf);
|
*s = 0.125f * hsum_float_8(accumf);
|
||||||
|
|
||||||
|
#elif defined(__AVX__)
|
||||||
|
static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
|
||||||
|
0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
|
||||||
|
};
|
||||||
|
|
||||||
|
static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
|
||||||
|
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
|
||||||
|
};
|
||||||
|
|
||||||
|
const __m128i m4 = _mm_set1_epi8(0xf);
|
||||||
|
const __m128i m1 = _mm_set1_epi8(1);
|
||||||
|
|
||||||
|
const __m128i mask1_0 = _mm_loadu_si128((const __m128i*)k_mask1);
|
||||||
|
const __m128i mask1_1 = _mm_loadu_si128((const __m128i*)k_mask1 + 1);
|
||||||
|
const __m128i mask2_0 = _mm_loadu_si128((const __m128i*)k_mask2);
|
||||||
|
const __m128i mask2_1 = _mm_loadu_si128((const __m128i*)k_mask2 + 1);
|
||||||
|
|
||||||
|
uint64_t aux64;
|
||||||
|
|
||||||
|
__m256 accumf = _mm256_setzero_ps();
|
||||||
|
for (int i = 0; i < nb; ++i) {
|
||||||
|
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||||
|
const uint8_t * restrict qs = x[i].qs;
|
||||||
|
const uint8_t * restrict qh = x[i].qh;
|
||||||
|
const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
|
||||||
|
const int8_t * restrict q8 = y[i].qs;
|
||||||
|
|
||||||
|
memcpy(&aux64, x[i].scales, 8);
|
||||||
|
const __m128i scales8 = _mm_add_epi8(_mm_slli_epi16(_mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), m4), 1), m1);
|
||||||
|
const __m128i scales16_0 = _mm_cvtepi8_epi16(scales8);
|
||||||
|
const __m128i scales16_1 = _mm_cvtepi8_epi16(_mm_srli_si128(scales8, 8));
|
||||||
|
|
||||||
|
__m128i sumi1_0 = _mm_setzero_si128();
|
||||||
|
__m128i sumi1_1 = _mm_setzero_si128();
|
||||||
|
__m128i sumi2_0 = _mm_setzero_si128();
|
||||||
|
__m128i sumi2_1 = _mm_setzero_si128();
|
||||||
|
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
||||||
|
const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||||
|
const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||||
|
const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||||
|
const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||||
|
const __m128i q2_1_0 = _mm_set_epi64x(iq2s_grid[qs[1] | ((qh[ib32+0] << 6) & 0x300)],
|
||||||
|
iq2s_grid[qs[0] | ((qh[ib32+0] << 8) & 0x300)]);
|
||||||
|
const __m128i q2_1_1 = _mm_set_epi64x(iq2s_grid[qs[3] | ((qh[ib32+0] << 2) & 0x300)],
|
||||||
|
iq2s_grid[qs[2] | ((qh[ib32+0] << 4) & 0x300)]);
|
||||||
|
const __m128i q2_2_0 = _mm_set_epi64x(iq2s_grid[qs[5] | ((qh[ib32+1] << 6) & 0x300)],
|
||||||
|
iq2s_grid[qs[4] | ((qh[ib32+1] << 8) & 0x300)]);
|
||||||
|
const __m128i q2_2_1 = _mm_set_epi64x(iq2s_grid[qs[7] | ((qh[ib32+1] << 2) & 0x300)],
|
||||||
|
iq2s_grid[qs[6] | ((qh[ib32+1] << 4) & 0x300)]);
|
||||||
|
qs += 8;
|
||||||
|
|
||||||
|
__m128i aux128_0 = _mm_set1_epi32(signs[0] | ((uint32_t) signs[1] << 16));
|
||||||
|
__m128i aux128_1 = aux128_0;
|
||||||
|
aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0);
|
||||||
|
aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1);
|
||||||
|
const __m128i s2_1_0 = _mm_cmpeq_epi8(aux128_0, mask2_0);
|
||||||
|
const __m128i s2_1_1 = _mm_cmpeq_epi8(aux128_1, mask2_1);
|
||||||
|
const __m128i q8s_1_0 = _mm_sub_epi8(_mm_xor_si128(s2_1_0, q8_1_0), s2_1_0);
|
||||||
|
const __m128i q8s_1_1 = _mm_sub_epi8(_mm_xor_si128(s2_1_1, q8_1_1), s2_1_1);
|
||||||
|
|
||||||
|
aux128_0 = _mm_set1_epi32(signs[2] | ((uint32_t) signs[3] << 16));
|
||||||
|
aux128_1 = aux128_0;
|
||||||
|
aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0);
|
||||||
|
aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1);
|
||||||
|
const __m128i s2_2_0 = _mm_cmpeq_epi8(aux128_0, mask2_0);
|
||||||
|
const __m128i s2_2_1 = _mm_cmpeq_epi8(aux128_1, mask2_1);
|
||||||
|
const __m128i q8s_2_0 = _mm_sub_epi8(_mm_xor_si128(s2_2_0, q8_2_0), s2_2_0);
|
||||||
|
const __m128i q8s_2_1 = _mm_sub_epi8(_mm_xor_si128(s2_2_1, q8_2_1), s2_2_1);
|
||||||
|
|
||||||
|
signs += 4;
|
||||||
|
|
||||||
|
const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
|
||||||
|
const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
|
||||||
|
const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
|
||||||
|
const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
|
||||||
|
|
||||||
|
const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_shuffle_epi8(scales16_0, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+0), 0)));
|
||||||
|
const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_shuffle_epi8(scales16_1, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+0), 1)));
|
||||||
|
const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_shuffle_epi8(scales16_0, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+1), 0)));
|
||||||
|
const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_shuffle_epi8(scales16_1, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+1), 1)));
|
||||||
|
sumi1_0 = _mm_add_epi32(sumi1_0, p1_0);
|
||||||
|
sumi1_1 = _mm_add_epi32(sumi1_1, p1_1);
|
||||||
|
sumi2_0 = _mm_add_epi32(sumi2_0, p2_0);
|
||||||
|
sumi2_1 = _mm_add_epi32(sumi2_1, p2_1);
|
||||||
|
}
|
||||||
|
|
||||||
|
accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
*s = 0.125f * hsum_float_8(accumf);
|
||||||
|
|
||||||
#elif defined(__POWER9_VECTOR__)
|
#elif defined(__POWER9_VECTOR__)
|
||||||
static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
|
static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
|
||||||
0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
|
0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
|
||||||
|
@ -10019,6 +10325,63 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
|
||||||
|
|
||||||
*s = 0.25f * hsum_float_8(accumf);
|
*s = 0.25f * hsum_float_8(accumf);
|
||||||
|
|
||||||
|
#elif defined(__AVX__)
|
||||||
|
const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
|
||||||
|
|
||||||
|
uint32_t aux32[2];
|
||||||
|
|
||||||
|
__m256 accumf = _mm256_setzero_ps();
|
||||||
|
for (int i = 0; i < nb; ++i) {
|
||||||
|
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||||
|
const uint8_t * restrict q3 = x[i].qs;
|
||||||
|
const uint8_t * restrict gas = x[i].qs + QK_K/4;
|
||||||
|
const int8_t * restrict q8 = y[i].qs;
|
||||||
|
__m128i sumi1_0 = _mm_setzero_si128();
|
||||||
|
__m128i sumi1_1 = _mm_setzero_si128();
|
||||||
|
__m128i sumi2_0 = _mm_setzero_si128();
|
||||||
|
__m128i sumi2_1 = _mm_setzero_si128();
|
||||||
|
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
||||||
|
const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||||
|
const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||||
|
const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||||
|
const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||||
|
const __m128i q2_1_0 = _mm_set_epi32(iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
|
||||||
|
const __m128i q2_1_1 = _mm_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]]);
|
||||||
|
q3 += 8;
|
||||||
|
const __m128i q2_2_0 = _mm_set_epi32(iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
|
||||||
|
const __m128i q2_2_1 = _mm_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]]);
|
||||||
|
q3 += 8;
|
||||||
|
memcpy(aux32, gas, 8); gas += 8;
|
||||||
|
const __m128i s2_1_0 = _mm_set_epi64x(signs64[(aux32[0] >> 7) & 127], signs64[(aux32[0] >> 0) & 127]);
|
||||||
|
const __m128i s2_1_1 = _mm_set_epi64x(signs64[(aux32[0] >> 21) & 127], signs64[(aux32[0] >> 14) & 127]);
|
||||||
|
const __m128i s2_2_0 = _mm_set_epi64x(signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]);
|
||||||
|
const __m128i s2_2_1 = _mm_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127]);
|
||||||
|
const __m128i q8s_1_0 = _mm_sign_epi8(q8_1_0, s2_1_0);
|
||||||
|
const __m128i q8s_1_1 = _mm_sign_epi8(q8_1_1, s2_1_1);
|
||||||
|
const __m128i q8s_2_0 = _mm_sign_epi8(q8_2_0, s2_2_0);
|
||||||
|
const __m128i q8s_2_1 = _mm_sign_epi8(q8_2_1, s2_2_1);
|
||||||
|
const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
|
||||||
|
const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
|
||||||
|
const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
|
||||||
|
const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
|
||||||
|
const uint16_t ls1 = aux32[0] >> 28;
|
||||||
|
const uint16_t ls2 = aux32[1] >> 28;
|
||||||
|
const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(2*ls1+1));
|
||||||
|
const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(2*ls1+1));
|
||||||
|
const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(2*ls2+1));
|
||||||
|
const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(2*ls2+1));
|
||||||
|
sumi1_0 = _mm_add_epi32(sumi1_0, p1_0);
|
||||||
|
sumi1_1 = _mm_add_epi32(sumi1_1, p1_1);
|
||||||
|
sumi2_0 = _mm_add_epi32(sumi2_0, p2_0);
|
||||||
|
sumi2_1 = _mm_add_epi32(sumi2_1, p2_1);
|
||||||
|
}
|
||||||
|
|
||||||
|
accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
*s = 0.25f * hsum_float_8(accumf);
|
||||||
|
|
||||||
#elif defined(__POWER9_VECTOR__)
|
#elif defined(__POWER9_VECTOR__)
|
||||||
const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
|
const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
|
||||||
|
|
||||||
|
@ -10370,6 +10733,112 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
|
||||||
|
|
||||||
*s = hsum_float_8(accumf);
|
*s = hsum_float_8(accumf);
|
||||||
|
|
||||||
|
#elif defined(__AVX__)
|
||||||
|
static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
|
||||||
|
0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
|
||||||
|
};
|
||||||
|
|
||||||
|
static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
|
||||||
|
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
|
||||||
|
};
|
||||||
|
|
||||||
|
const __m128i mask1_0 = _mm_loadu_si128((const __m128i*)k_mask1);
|
||||||
|
const __m128i mask1_1 = _mm_loadu_si128((const __m128i*)k_mask1 + 1);
|
||||||
|
const __m128i mask2_0 = _mm_loadu_si128((const __m128i*)k_mask2);
|
||||||
|
const __m128i mask2_1 = _mm_loadu_si128((const __m128i*)k_mask2 + 1);
|
||||||
|
|
||||||
|
const __m128i idx_mul_0 = _mm_set_epi32(32, 64, 128, 256);
|
||||||
|
const __m128i idx_mul_1 = _mm_set_epi32(2, 4, 8, 16);
|
||||||
|
const __m128i idx_mask = _mm_set1_epi32(256);
|
||||||
|
|
||||||
|
typedef union {
|
||||||
|
__m128i vec[4];
|
||||||
|
uint32_t index[16];
|
||||||
|
} index_t;
|
||||||
|
|
||||||
|
index_t idx;
|
||||||
|
|
||||||
|
__m256 accumf = _mm256_setzero_ps();
|
||||||
|
for (int i = 0; i < nb; ++i) {
|
||||||
|
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||||
|
const uint8_t * restrict qs = x[i].qs;
|
||||||
|
const uint8_t * restrict qh = x[i].qh;
|
||||||
|
const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
|
||||||
|
const int8_t * restrict q8 = y[i].qs;
|
||||||
|
__m128i sumi1_0 = _mm_setzero_si128();
|
||||||
|
__m128i sumi1_1 = _mm_setzero_si128();
|
||||||
|
__m128i sumi2_0 = _mm_setzero_si128();
|
||||||
|
__m128i sumi2_1 = _mm_setzero_si128();
|
||||||
|
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
||||||
|
const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||||
|
const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||||
|
const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||||
|
const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||||
|
const __m128i qs_tmp = _mm_loadu_si128((const __m128i *)qs);
|
||||||
|
const __m128i idx_l_0 = _mm_cvtepu8_epi16(qs_tmp);
|
||||||
|
const __m128i idx_l_1 = _mm_cvtepu8_epi16(_mm_srli_si128(qs_tmp, 8)); qs += 16;
|
||||||
|
idx.vec[0] = _mm_set1_epi32(qh[ib32+0]);
|
||||||
|
idx.vec[1] = idx.vec[0];
|
||||||
|
idx.vec[2] = _mm_set1_epi32(qh[ib32+1]);
|
||||||
|
idx.vec[3] = idx.vec[2];
|
||||||
|
|
||||||
|
idx.vec[0] = _mm_and_si128(_mm_mullo_epi32(idx.vec[0], idx_mul_0), idx_mask);
|
||||||
|
idx.vec[1] = _mm_and_si128(_mm_mullo_epi32(idx.vec[1], idx_mul_1), idx_mask);
|
||||||
|
idx.vec[2] = _mm_and_si128(_mm_mullo_epi32(idx.vec[2], idx_mul_0), idx_mask);
|
||||||
|
idx.vec[3] = _mm_and_si128(_mm_mullo_epi32(idx.vec[3], idx_mul_1), idx_mask);
|
||||||
|
|
||||||
|
idx.vec[0] = _mm_or_si128(idx.vec[0], _mm_cvtepi16_epi32(idx_l_0));
|
||||||
|
idx.vec[1] = _mm_or_si128(idx.vec[1], _mm_cvtepi16_epi32(_mm_srli_si128(idx_l_0, 8)));
|
||||||
|
idx.vec[2] = _mm_or_si128(idx.vec[2], _mm_cvtepi16_epi32(idx_l_1));
|
||||||
|
idx.vec[3] = _mm_or_si128(idx.vec[3], _mm_cvtepi16_epi32(_mm_srli_si128(idx_l_1, 8)));
|
||||||
|
|
||||||
|
const __m128i q2_1_0 = _mm_set_epi32(iq3s_grid[idx.index[3]], iq3s_grid[idx.index[2]], iq3s_grid[idx.index[1]], iq3s_grid[idx.index[0]]);
|
||||||
|
const __m128i q2_1_1 = _mm_set_epi32(iq3s_grid[idx.index[7]], iq3s_grid[idx.index[6]], iq3s_grid[idx.index[5]], iq3s_grid[idx.index[4]]);
|
||||||
|
const __m128i q2_2_0 = _mm_set_epi32(iq3s_grid[idx.index[11]], iq3s_grid[idx.index[10]], iq3s_grid[idx.index[9]], iq3s_grid[idx.index[8]]);
|
||||||
|
const __m128i q2_2_1 = _mm_set_epi32(iq3s_grid[idx.index[15]], iq3s_grid[idx.index[14]], iq3s_grid[idx.index[13]], iq3s_grid[idx.index[12]]);
|
||||||
|
|
||||||
|
__m128i aux128_0 = _mm_set1_epi32(signs[0] | (signs[1] << 16));
|
||||||
|
__m128i aux128_1 = aux128_0;
|
||||||
|
aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0);
|
||||||
|
aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1);
|
||||||
|
const __m128i s2_1_0 = _mm_cmpeq_epi8(aux128_0, mask2_0);
|
||||||
|
const __m128i s2_1_1 = _mm_cmpeq_epi8(aux128_1, mask2_1);
|
||||||
|
const __m128i q8s_1_0 = _mm_sub_epi8(_mm_xor_si128(s2_1_0, q8_1_0), s2_1_0);
|
||||||
|
const __m128i q8s_1_1 = _mm_sub_epi8(_mm_xor_si128(s2_1_1, q8_1_1), s2_1_1);
|
||||||
|
|
||||||
|
aux128_0 = _mm_set1_epi32(signs[2] | (signs[3] << 16));
|
||||||
|
aux128_1 = aux128_0;
|
||||||
|
aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0);
|
||||||
|
aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1);
|
||||||
|
const __m128i s2_2_0 = _mm_cmpeq_epi8(aux128_0, mask2_0);
|
||||||
|
const __m128i s2_2_1 = _mm_cmpeq_epi8(aux128_1, mask2_1);
|
||||||
|
const __m128i q8s_2_0 = _mm_sub_epi8(_mm_xor_si128(s2_2_0, q8_2_0), s2_2_0);
|
||||||
|
const __m128i q8s_2_1 = _mm_sub_epi8(_mm_xor_si128(s2_2_1, q8_2_1), s2_2_1);
|
||||||
|
|
||||||
|
signs += 4;
|
||||||
|
|
||||||
|
const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
|
||||||
|
const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
|
||||||
|
const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
|
||||||
|
const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
|
||||||
|
const uint16_t ls1 = x[i].scales[ib32/2] & 0xf;
|
||||||
|
const uint16_t ls2 = x[i].scales[ib32/2] >> 4;
|
||||||
|
const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(2*ls1+1));
|
||||||
|
const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(2*ls1+1));
|
||||||
|
const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(2*ls2+1));
|
||||||
|
const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(2*ls2+1));
|
||||||
|
sumi1_0 = _mm_add_epi32(sumi1_0, p1_0);
|
||||||
|
sumi1_1 = _mm_add_epi32(sumi1_1, p1_1);
|
||||||
|
sumi2_0 = _mm_add_epi32(sumi2_0, p2_0);
|
||||||
|
sumi2_1 = _mm_add_epi32(sumi2_1, p2_1);
|
||||||
|
}
|
||||||
|
|
||||||
|
accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
*s = hsum_float_8(accumf);
|
||||||
|
|
||||||
#elif defined(__POWER9_VECTOR__)
|
#elif defined(__POWER9_VECTOR__)
|
||||||
static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
|
static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
|
||||||
0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
|
0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
|
||||||
|
@ -10607,6 +11076,14 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(__AVX__)
|
||||||
|
static inline __m128i mul_add_epi8_sse(const __m128i x, const __m128i y) {
|
||||||
|
const __m128i ax = _mm_sign_epi8(x, x);
|
||||||
|
const __m128i sy = _mm_sign_epi8(y, x);
|
||||||
|
return _mm_maddubs_epi16(ax, sy);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
#if defined(__AVX2__)
|
#if defined(__AVX2__)
|
||||||
static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
|
static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
|
||||||
const __m256i ax = _mm256_sign_epi8(x, x);
|
const __m256i ax = _mm256_sign_epi8(x, x);
|
||||||
|
@ -10724,6 +11201,54 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
|
||||||
|
|
||||||
*s = hsum_float_8(accum) + IQ1S_DELTA * accum1;
|
*s = hsum_float_8(accum) + IQ1S_DELTA * accum1;
|
||||||
|
|
||||||
|
#elif defined __AVX__
|
||||||
|
__m256 accum = _mm256_setzero_ps();
|
||||||
|
float accum1 = 0;
|
||||||
|
for (int i = 0; i < nb; ++i) {
|
||||||
|
|
||||||
|
const int8_t * q8 = y[i].qs;
|
||||||
|
const uint8_t * qs = x[i].qs;
|
||||||
|
const uint16_t * qh = x[i].qh;
|
||||||
|
|
||||||
|
__m128i sumi1_0 = _mm_setzero_si128();
|
||||||
|
__m128i sumi1_1 = _mm_setzero_si128();
|
||||||
|
int sumi1 = 0;
|
||||||
|
for (int ib = 0; ib < QK_K/32; ib += 2) {
|
||||||
|
const __m128i q1b_1_0 = _mm_set_epi64x(iq1s_grid[qs[1] | ((qh[ib+0] << 5) & 0x700)], iq1s_grid[qs[0] | ((qh[ib+0] << 8) & 0x700)]);
|
||||||
|
const __m128i q1b_1_1 = _mm_set_epi64x(iq1s_grid[qs[3] | ((qh[ib+0] >> 1) & 0x700)], iq1s_grid[qs[2] | ((qh[ib+0] << 2) & 0x700)]);
|
||||||
|
const __m128i q1b_2_0 = _mm_set_epi64x(iq1s_grid[qs[5] | ((qh[ib+1] << 5) & 0x700)], iq1s_grid[qs[4] | ((qh[ib+1] << 8) & 0x700)]);
|
||||||
|
const __m128i q1b_2_1 = _mm_set_epi64x(iq1s_grid[qs[7] | ((qh[ib+1] >> 1) & 0x700)], iq1s_grid[qs[6] | ((qh[ib+1] << 2) & 0x700)]);
|
||||||
|
qs += 8;
|
||||||
|
const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||||
|
const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||||
|
const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||||
|
const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||||
|
|
||||||
|
const __m128i dot1_0 = mul_add_epi8_sse(q1b_1_0, q8b_1_0);
|
||||||
|
const __m128i dot1_1 = mul_add_epi8_sse(q1b_1_1, q8b_1_1);
|
||||||
|
const __m128i dot2_0 = mul_add_epi8_sse(q1b_2_0, q8b_2_0);
|
||||||
|
const __m128i dot2_1 = mul_add_epi8_sse(q1b_2_1, q8b_2_1);
|
||||||
|
const int16_t ls1 = 2*((qh[ib+0] >> 12) & 7) + 1;
|
||||||
|
const int16_t ls2 = 2*((qh[ib+1] >> 12) & 7) + 1;
|
||||||
|
const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(ls1));
|
||||||
|
const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(ls1));
|
||||||
|
const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(ls2));
|
||||||
|
const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(ls2));
|
||||||
|
|
||||||
|
sumi1_0 = _mm_add_epi32(sumi1_0, _mm_add_epi32(p1_0, p2_0));
|
||||||
|
sumi1_1 = _mm_add_epi32(sumi1_1, _mm_add_epi32(p1_1, p2_1));
|
||||||
|
sumi1 += (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]) * (qh[ib+0] & 0x8000 ? -1 : 1) * ls1
|
||||||
|
+ (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? -1 : 1) * ls2;
|
||||||
|
}
|
||||||
|
|
||||||
|
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
||||||
|
accum = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(sumi1_1, sumi1_0))), accum);
|
||||||
|
accum1 += d * sumi1;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
*s = hsum_float_8(accum) + IQ1S_DELTA * accum1;
|
||||||
|
|
||||||
#elif defined(__POWER9_VECTOR__)
|
#elif defined(__POWER9_VECTOR__)
|
||||||
const vector unsigned char v0 = vec_splats((unsigned char)0x0);
|
const vector unsigned char v0 = vec_splats((unsigned char)0x0);
|
||||||
const vector unsigned short vsign = vec_splats((unsigned short)0x8000);
|
const vector unsigned short vsign = vec_splats((unsigned short)0x8000);
|
||||||
|
@ -11062,6 +11587,92 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void
|
||||||
|
|
||||||
*s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2);
|
*s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2);
|
||||||
|
|
||||||
|
#elif defined __AVX__
|
||||||
|
const __m128i mask = _mm_set1_epi16(0x7);
|
||||||
|
const __m128i mone = _mm_set1_epi16(1);
|
||||||
|
|
||||||
|
__m256 accum1 = _mm256_setzero_ps();
|
||||||
|
__m256 accum2 = _mm256_setzero_ps();
|
||||||
|
for (int i = 0; i < nb; ++i) {
|
||||||
|
|
||||||
|
const int8_t * q8 = y[i].qs;
|
||||||
|
const uint8_t * qs = x[i].qs;
|
||||||
|
const uint8_t * qh = x[i].qh;
|
||||||
|
const uint16_t * sc = (const uint16_t *)x[i].scales;
|
||||||
|
|
||||||
|
scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
|
||||||
|
|
||||||
|
__m128i sumi1_0 = _mm_setzero_si128();
|
||||||
|
__m128i sumi1_1 = _mm_setzero_si128();
|
||||||
|
__m128i sumi2_0 = _mm_setzero_si128();
|
||||||
|
__m128i sumi2_1 = _mm_setzero_si128();
|
||||||
|
for (int ib = 0; ib < QK_K/32; ib += 2) {
|
||||||
|
const __m128i q1b_1_0 = _mm_set_epi64x(
|
||||||
|
iq1s_grid[qs[1] | (((uint16_t)qh[0] << 4) & 0x700)], iq1s_grid[qs[0] | (((uint16_t)qh[0] << 8) & 0x700)]);
|
||||||
|
const __m128i q1b_1_1 = _mm_set_epi64x(
|
||||||
|
            iq1s_grid[qs[3] | (((uint16_t)qh[1] << 4) & 0x700)], iq1s_grid[qs[2] | (((uint16_t)qh[1] << 8) & 0x700)]);
        const __m128i q1b_2_0 = _mm_set_epi64x(
            iq1s_grid[qs[5] | (((uint16_t)qh[2] << 4) & 0x700)], iq1s_grid[qs[4] | (((uint16_t)qh[2] << 8) & 0x700)]);
        const __m128i q1b_2_1 = _mm_set_epi64x(
            iq1s_grid[qs[7] | (((uint16_t)qh[3] << 4) & 0x700)], iq1s_grid[qs[6] | (((uint16_t)qh[3] << 8) & 0x700)]);
        const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
        const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
        const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
        const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;

        const __m128i dot1_0 = mul_add_epi8_sse(q1b_1_0, q8b_1_0);
        const __m128i dot1_1 = mul_add_epi8_sse(q1b_1_1, q8b_1_1);
        const __m128i dot2_0 = mul_add_epi8_sse(q1b_2_0, q8b_2_0);
        const __m128i dot2_1 = mul_add_epi8_sse(q1b_2_1, q8b_2_1);

        const __m128i delta1_0 = _mm_set_epi64x(qh[0] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
                                                qh[0] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
        const __m128i delta1_1 = _mm_set_epi64x(qh[1] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
                                                qh[1] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
        const __m128i delta2_0 = _mm_set_epi64x(qh[2] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
                                                qh[2] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
        const __m128i delta2_1 = _mm_set_epi64x(qh[3] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
                                                qh[3] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);

        const __m128i dot3_0 = mul_add_epi8_sse(delta1_0, q8b_1_0);
        const __m128i dot3_1 = mul_add_epi8_sse(delta1_1, q8b_1_1);
        const __m128i dot4_0 = mul_add_epi8_sse(delta2_0, q8b_2_0);
        const __m128i dot4_1 = mul_add_epi8_sse(delta2_1, q8b_2_1);

        __m128i scale1_0 = _mm_set1_epi16(sc[ib/2] >> 0);
        __m128i scale1_1 = _mm_set1_epi16(sc[ib/2] >> 3);
        __m128i scale2_0 = _mm_set1_epi16(sc[ib/2] >> 6);
        __m128i scale2_1 = _mm_set1_epi16(sc[ib/2] >> 9);

        scale1_0 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale1_0, mask), 1), mone);
        scale1_1 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale1_1, mask), 1), mone);
        scale2_0 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale2_0, mask), 1), mone);
        scale2_1 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale2_1, mask), 1), mone);
        const __m128i p1_0 = _mm_madd_epi16(dot1_0, scale1_0);
        const __m128i p1_1 = _mm_madd_epi16(dot1_1, scale1_1);
        const __m128i p2_0 = _mm_madd_epi16(dot2_0, scale2_0);
        const __m128i p2_1 = _mm_madd_epi16(dot2_1, scale2_1);
        const __m128i p3_0 = _mm_madd_epi16(dot3_0, scale1_0);
        const __m128i p3_1 = _mm_madd_epi16(dot3_1, scale1_1);
        const __m128i p4_0 = _mm_madd_epi16(dot4_0, scale2_0);
        const __m128i p4_1 = _mm_madd_epi16(dot4_1, scale2_1);

        sumi1_0 = _mm_add_epi32(sumi1_0, _mm_add_epi32(p1_0, p2_0));
        sumi1_1 = _mm_add_epi32(sumi1_1, _mm_add_epi32(p1_1, p2_1));
        sumi2_0 = _mm_add_epi32(sumi2_0, _mm_add_epi32(p3_0, p4_0));
        sumi2_1 = _mm_add_epi32(sumi2_1, _mm_add_epi32(p3_1, p4_1));

        qs += 8; qh += 4;
    }

    const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(scale.f16));

    accum1 = _mm256_add_ps(_mm256_mul_ps(d, _mm256_cvtepi32_ps(MM256_SET_M128I(sumi1_1, sumi1_0))), accum1);
    accum2 = _mm256_add_ps(_mm256_mul_ps(d, _mm256_cvtepi32_ps(MM256_SET_M128I(sumi2_1, sumi2_0))), accum2);
}

*s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2);

#else

    int sum1[2], sum2[2], delta[4];
@ -11192,6 +11803,44 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
    *s = hsum_float_8(_mm256_add_ps(accum1, accum2));

#elif defined __AVX__
    const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
    const __m128i m4b  = _mm_set1_epi8(0x0f);
    const __m128i mone = _mm_set1_epi16(1);

    __m256 accum1 = _mm256_setzero_ps();
    __m256 accum2 = _mm256_setzero_ps();
    for (int ib = 0; ib < nb; ib += 2) {
        const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[0].qs);
        const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[1].qs);
        const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)y[0].qs);
        const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)y[0].qs + 1);
        const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)y[1].qs);
        const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)y[1].qs + 1);

        const __m128i q4b_1_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b));
        const __m128i q4b_1_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b));
        const __m128i q4b_2_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b));
        const __m128i q4b_2_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b));
        const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0);
        const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1);
        const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0);
        const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1);
        const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, mone);
        const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, mone);
        const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, mone);
        const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, mone);
        accum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[0].d)*GGML_FP16_TO_FP32(x[0].d)),
                _mm256_cvtepi32_ps(MM256_SET_M128I(p_1_1, p_1_0))), accum1);
        accum2 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[1].d)*GGML_FP16_TO_FP32(x[1].d)),
                _mm256_cvtepi32_ps(MM256_SET_M128I(p_2_1, p_2_0))), accum2);

        y += 2;
        x += 2;
    }

    *s = hsum_float_8(_mm256_add_ps(accum1, accum2));

#elif defined(__POWER9_VECTOR__)
    const vector signed char lowMask = vec_splats((signed char)0xF);
    const vector signed int v0 = vec_splats((int32_t)0);
@ -11382,6 +12031,54 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
    *s = hsum_float_8(accum);

#elif defined __AVX__
    const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
    const __m128i m4b = _mm_set1_epi8(0x0f);

    __m256 accum = _mm256_setzero_ps();
    for (int ibl = 0; ibl < nb; ++ibl) {
        const uint8_t * qs = x[ibl].qs;
        const int8_t  * q8 = y[ibl].qs;
        uint16_t sh = x[ibl].scales_h;
        __m128i sumi1_0 = _mm_setzero_si128();
        __m128i sumi1_1 = _mm_setzero_si128();
        __m128i sumi2_0 = _mm_setzero_si128();
        __m128i sumi2_1 = _mm_setzero_si128();
        for (int ib = 0; ib < QK_K/32; ib += 2) {
            const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)qs); qs += 16;
            const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)qs); qs += 16;
            const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
            const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
            const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
            const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
            const __m128i q4b_1_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b));
            const __m128i q4b_1_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b));
            const __m128i q4b_2_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b));
            const __m128i q4b_2_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b));
            const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0);
            const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1);
            const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0);
            const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1);
            const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32;
            const int16_t ls2 = ((x[ibl].scales_l[ib/2] >> 4) | ((sh << 2) & 0x30)) - 32;
            sh >>= 4;
            const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, _mm_set1_epi16(ls1));
            const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, _mm_set1_epi16(ls1));
            const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, _mm_set1_epi16(ls2));
            const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, _mm_set1_epi16(ls2));
            sumi1_0 = _mm_add_epi32(p_1_0, sumi1_0);
            sumi1_1 = _mm_add_epi32(p_1_1, sumi1_1);
            sumi2_0 = _mm_add_epi32(p_2_0, sumi2_0);
            sumi2_1 = _mm_add_epi32(p_2_1, sumi2_1);
        }
        __m128i sumi12_0 = _mm_add_epi32(sumi1_0, sumi2_0);
        __m128i sumi12_1 = _mm_add_epi32(sumi1_1, sumi2_1);
        accum = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[ibl].d)*y[ibl].d),
                _mm256_cvtepi32_ps(MM256_SET_M128I(sumi12_1, sumi12_0))), accum);
    }

    *s = hsum_float_8(accum);

#elif defined(__POWER9_VECTOR__)
    const vector signed char lowMask = vec_splats((signed char)0xF);
    const vector int v0 = vec_splats((int32_t)0);
@ -4911,7 +4911,7 @@ static void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor *sr
    GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
    GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);

    GGML_TENSOR_BINARY_OP_LOCALS01;

    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
    queue_ptr main_stream = ctx.stream();
@ -588,266 +588,222 @@ namespace dpct
    out = prop;
  }

  /// dpct device extension
  class device_ext : public sycl::device {
    typedef std::mutex mutex_type;

  public:
    device_ext() : sycl::device() {}
    ~device_ext() {
      std::lock_guard<mutex_type> lock(m_mutex);
      clear_queues();
    }
    device_ext(const sycl::device &base) : sycl::device(base) {
      std::lock_guard<mutex_type> lock(m_mutex);
      init_queues();
    }

    int is_native_atomic_supported() { return 0; }
    int get_major_version() const { return dpct::get_major_version(*this); }

    int get_minor_version() const { return dpct::get_minor_version(*this); }

    int get_max_compute_units() const {
      return get_device_info().get_max_compute_units();
    }

    /// Return the maximum clock frequency of this device in KHz.
    int get_max_clock_frequency() const {
      return get_device_info().get_max_clock_frequency();
    }

    int get_integrated() const { return get_device_info().get_integrated(); }

    int get_max_sub_group_size() const {
      return get_device_info().get_max_sub_group_size();
    }

    int get_max_register_size_per_work_group() const {
      return get_device_info().get_max_register_size_per_work_group();
    }

    int get_max_work_group_size() const {
      return get_device_info().get_max_work_group_size();
    }

    int get_mem_base_addr_align() const {
      return get_info<sycl::info::device::mem_base_addr_align>();
    }

    size_t get_global_mem_size() const {
      return get_device_info().get_global_mem_size();
    }

    size_t get_max_mem_alloc_size() const {
      return get_device_info().get_max_mem_alloc_size();
    }

    /// Get the number of bytes of free and total memory on the SYCL device.
    /// \param [out] free_memory The number of bytes of free memory on the SYCL device.
    /// \param [out] total_memory The number of bytes of total memory on the SYCL device.
    void get_memory_info(size_t &free_memory, size_t &total_memory) {
      total_memory = get_device_info().get_global_mem_size();
      const char *warning_info =
          "get_memory_info: [warning] ext_intel_free_memory is not "
          "supported (export/set ZES_ENABLE_SYSMAN=1 to support), "
          "use total memory as free memory";
#if (defined(__SYCL_COMPILER_VERSION) && __SYCL_COMPILER_VERSION >= 20221105)
      if (!has(sycl::aspect::ext_intel_free_memory)) {
        std::cerr << warning_info << std::endl;
        free_memory = total_memory;
      } else {
        free_memory = get_info<sycl::ext::intel::info::device::free_memory>();
      }
#else
      std::cerr << warning_info << std::endl;
      free_memory = total_memory;
#if defined(_MSC_VER) && !defined(__clang__)
#pragma message("Querying the number of bytes of free memory is not supported")
#else
#warning "Querying the number of bytes of free memory is not supported"
#endif
#endif
    }

    void get_device_info(device_info &out) const {
      dpct::get_device_info(out, *this);
    }

    device_info get_device_info() const {
      device_info prop;
      dpct::get_device_info(prop, *this);
      return prop;
    }

    void reset() {
      std::lock_guard<mutex_type> lock(m_mutex);
      clear_queues();
      init_queues();
    }

    sycl::queue &in_order_queue() { return _q_in_order; }

    sycl::queue &out_of_order_queue() { return _q_out_of_order; }

    sycl::queue &default_queue() { return in_order_queue(); }

    void queues_wait_and_throw() {
      std::unique_lock<mutex_type> lock(m_mutex);
      lock.unlock();
      for (auto &q : _queues) {
        q.wait_and_throw();
      }
      // Guard the destruct of current_queues to make sure the ref count is
      // safe.
      lock.lock();
    }

    sycl::queue create_queue(bool enable_exception_handler = false) {
      return create_in_order_queue(enable_exception_handler);
    }

    sycl::queue create_queue(sycl::device device,
                             bool enable_exception_handler = false) {
      return create_in_order_queue(device, enable_exception_handler);
    }

    sycl::queue create_in_order_queue(bool enable_exception_handler = false) {
      std::lock_guard<mutex_type> lock(m_mutex);
      return create_queue_impl(enable_exception_handler,
                               sycl::property::queue::in_order());
    }

    sycl::queue create_in_order_queue(sycl::device device,
                                      bool enable_exception_handler = false) {
      std::lock_guard<mutex_type> lock(m_mutex);
      return create_queue_impl(device, enable_exception_handler,
                               sycl::property::queue::in_order());
    }

    sycl::queue create_out_of_order_queue(
        bool enable_exception_handler = false) {
      std::lock_guard<mutex_type> lock(m_mutex);
      return create_queue_impl(enable_exception_handler);
    }

    void destroy_queue(sycl::queue queue) {
      std::lock_guard<mutex_type> lock(m_mutex);
      _queues.clear();
    }
    void set_saved_queue(sycl::queue q) {
      std::lock_guard<mutex_type> lock(m_mutex);
      _saved_queue = q;
    }
    sycl::queue get_saved_queue() const {
      std::lock_guard<mutex_type> lock(m_mutex);
      return _saved_queue;
    }

  private:
    void clear_queues() { _queues.clear(); }

    void init_queues() {
      _q_in_order =
          create_queue_impl(true, sycl::property::queue::in_order());
      _q_out_of_order = create_queue_impl(true);
      _saved_queue = default_queue();
    }

    /// Caller should acquire resource \p m_mutex before calling this function.
    template <class... Properties>
    sycl::queue create_queue_impl(bool enable_exception_handler,
                                  Properties... properties) {
      sycl::async_handler eh = {};
      if (enable_exception_handler) {
        eh = exception_handler;
      }
      auto q = sycl::queue(*this, eh,
                           sycl::property_list(
#ifdef DPCT_PROFILING_ENABLED
                               sycl::property::queue::enable_profiling(),
#endif
                               properties...));
      _queues.push_back(q);

      return _queues.back();
    }

    template <class... Properties>
    sycl::queue create_queue_impl(sycl::device device,
                                  bool enable_exception_handler,
                                  Properties... properties) {
      sycl::async_handler eh = {};
      if (enable_exception_handler) {
        eh = exception_handler;
      }
      _queues.push_back(
          sycl::queue(device, eh,
                      sycl::property_list(
#ifdef DPCT_PROFILING_ENABLED
                          sycl::property::queue::enable_profiling(),
#endif
                          properties...)));

      return _queues.back();
    }

    void get_version(int &major, int &minor) const {
      detail::get_version(*this, major, minor);
    }
    sycl::queue _q_in_order, _q_out_of_order;
    sycl::queue _saved_queue;
    std::vector<sycl::queue> _queues;
    mutable mutex_type m_mutex;
  };

  /// device manager
  class dev_mgr
  {
ggml-vulkan-shaders.hpp: 39661 changes (file diff suppressed because it is too large)
ggml-vulkan.cpp: 2091 changes (file diff suppressed because it is too large)
ggml.h: 41 changes
@ -312,6 +312,12 @@
    GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
    GGML_TENSOR_LOCALS(size_t,  nb, dst, nb)

#define GGML_TENSOR_BINARY_OP_LOCALS01 \
    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb) \
    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
    GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb)

#ifdef __cplusplus
extern "C" {
#endif

@ -585,11 +591,7 @@ extern "C" {
        struct ggml_tensor * grad;
        struct ggml_tensor * src[GGML_MAX_SRC];

        // source tensor and offset for views
        struct ggml_tensor * view_src;
        size_t               view_offs;

@ -599,7 +601,7 @@ extern "C" {
        void * extra; // extra things e.g. for ggml-cuda.cu

        // char padding[4];
    };

    static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);

@ -646,11 +648,6 @@ extern "C" {
        struct ggml_hash_set visited_hash_table;

        enum ggml_cgraph_eval_order order;
    };

    // scratch buffer

@ -667,28 +664,6 @@ extern "C" {
        bool no_alloc; // don't allocate memory for the tensor data
    };

    // numa strategies
    enum ggml_numa_strategy {
        GGML_NUMA_STRATEGY_DISABLED = 0,
@ -49,6 +49,7 @@ class Keys:
        EXPERT_WEIGHTS_SCALE = "{arch}.expert_weights_scale"
        POOLING_TYPE = "{arch}.pooling_type"
        LOGIT_SCALE = "{arch}.logit_scale"
        DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id"

    class Attention:
        HEAD_COUNT = "{arch}.attention.head_count"

@ -62,6 +63,7 @@ class Keys:
        CAUSAL = "{arch}.attention.causal"
        Q_LORA_RANK = "{arch}.attention.q_lora_rank"
        KV_LORA_RANK = "{arch}.attention.kv_lora_rank"
        REL_BUCKETS_COUNT = "{arch}.attention.relative_buckets_count"

    class Rope:
        DIMENSION_COUNT = "{arch}.rope.dimension_count"
@ -73,6 +75,11 @@ class Keys:
        SCALING_FINETUNED = "{arch}.rope.scaling.finetuned"
        SCALING_YARN_LOG_MUL = "{arch}.rope.scaling.yarn_log_multiplier"

    class Split:
        LLM_KV_SPLIT_NO = "split.no"
        LLM_KV_SPLIT_COUNT = "split.count"
        LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count"

    class SSM:
        CONV_KERNEL = "{arch}.ssm.conv_kernel"
        INNER_SIZE = "{arch}.ssm.inner_size"
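For orientation, a minimal sketch of reading these split keys back from a shard. Only the key strings and their value types come from this change; the shard file name is hypothetical and the GGUFReader.get_field call is assumed from the existing gguf-py reader, not from this diff.

```python
from gguf import GGUFReader  # assumed reader API from gguf-py

reader = GGUFReader("model-00001-of-00003.gguf")    # hypothetical shard name
print(reader.get_field("split.no"))                 # shard index (UINT16), see add_shard_kv_data below
print(reader.get_field("split.count"))              # total number of shards (UINT16)
print(reader.get_field("split.tensors.count"))      # tensors across all shards (INT32)
```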
@ -97,6 +104,8 @@ class Keys:
        ADD_BOS = "tokenizer.ggml.add_bos_token"
        ADD_EOS = "tokenizer.ggml.add_eos_token"
        ADD_PREFIX = "tokenizer.ggml.add_space_prefix"
        REMOVE_EXTRA_WS = "tokenizer.ggml.remove_extra_whitespaces"
        PRECOMPILED_CHARSMAP = "tokenizer.ggml.precompiled_charsmap"
        HF_JSON = "tokenizer.huggingface.json"
        RWKV = "tokenizer.rwkv.world"
        CHAT_TEMPLATE = "tokenizer.chat_template"
@ -117,18 +126,18 @@ class Keys:
class MODEL_ARCH(IntEnum):
    LLAMA = auto()
    FALCON = auto()
    BAICHUAN = auto()
    GROK = auto()
    GPT2 = auto()
    GPTJ = auto()
    GPTNEOX = auto()
    MPT = auto()
    STARCODER = auto()
    REFACT = auto()
    BERT = auto()
    NOMIC_BERT = auto()
    JINA_BERT_V2 = auto()
    BLOOM = auto()
    STABLELM = auto()

@ -152,57 +161,88 @@ class MODEL_ARCH(IntEnum):
    ARCTIC = auto()
    DEEPSEEK2 = auto()
    CHATGLM = auto()
    BITNET = auto()
    T5 = auto()


class MODEL_TENSOR(IntEnum):
    TOKEN_EMBD = auto()
    TOKEN_EMBD_NORM = auto()
    TOKEN_TYPES = auto()
    POS_EMBD = auto()
    OUTPUT = auto()
    OUTPUT_NORM = auto()
    ROPE_FREQS = auto()
    ROPE_FACTORS_LONG = auto()
    ROPE_FACTORS_SHORT = auto()
    ATTN_Q = auto()
    ATTN_K = auto()
    ATTN_V = auto()
    ATTN_QKV = auto()
    ATTN_OUT = auto()
    ATTN_NORM = auto()
    ATTN_NORM_2 = auto()
    ATTN_OUT_NORM = auto()
    ATTN_ROT_EMBD = auto()
    FFN_GATE_INP = auto()
    FFN_GATE_INP_SHEXP = auto()
    FFN_NORM = auto()
    FFN_GATE = auto()
    FFN_DOWN = auto()
    FFN_UP = auto()
    FFN_ACT = auto()
    FFN_NORM_EXP = auto()
    FFN_GATE_EXP = auto()
    FFN_DOWN_EXP = auto()
    FFN_UP_EXP = auto()
    FFN_GATE_SHEXP = auto()
    FFN_DOWN_SHEXP = auto()
    FFN_UP_SHEXP = auto()
    ATTN_Q_NORM = auto()
    ATTN_K_NORM = auto()
    LAYER_OUT_NORM = auto()
    SSM_IN = auto()
    SSM_CONV1D = auto()
    SSM_X = auto()
    SSM_DT = auto()
    SSM_A = auto()
    SSM_D = auto()
    SSM_OUT = auto()
    ATTN_Q_A = auto()
    ATTN_Q_B = auto()
    ATTN_KV_A_MQA = auto()
    ATTN_KV_B = auto()
    ATTN_Q_A_NORM = auto()
    ATTN_KV_A_NORM = auto()
    FFN_SUB_NORM = auto()
    ATTN_SUB_NORM = auto()
    DEC_ATTN_NORM = auto()
    DEC_ATTN_Q = auto()
    DEC_ATTN_K = auto()
    DEC_ATTN_V = auto()
    DEC_ATTN_OUT = auto()
    DEC_ATTN_REL_B = auto()
    DEC_CROSS_ATTN_NORM = auto()
    DEC_CROSS_ATTN_Q = auto()
    DEC_CROSS_ATTN_K = auto()
    DEC_CROSS_ATTN_V = auto()
    DEC_CROSS_ATTN_OUT = auto()
    DEC_CROSS_ATTN_REL_B = auto()
    DEC_FFN_NORM = auto()
    DEC_FFN_GATE = auto()
    DEC_FFN_DOWN = auto()
    DEC_FFN_UP = auto()
    DEC_OUTPUT_NORM = auto()
    ENC_ATTN_NORM = auto()
    ENC_ATTN_Q = auto()
    ENC_ATTN_K = auto()
    ENC_ATTN_V = auto()
    ENC_ATTN_OUT = auto()
    ENC_ATTN_REL_B = auto()
    ENC_FFN_NORM = auto()
    ENC_FFN_GATE = auto()
    ENC_FFN_DOWN = auto()
    ENC_FFN_UP = auto()
    ENC_OUTPUT_NORM = auto()


MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
@ -241,57 +281,89 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
    MODEL_ARCH.ARCTIC: "arctic",
    MODEL_ARCH.DEEPSEEK2: "deepseek2",
    MODEL_ARCH.CHATGLM: "chatglm",
    MODEL_ARCH.BITNET: "bitnet",
    MODEL_ARCH.T5: "t5",
}

TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
    MODEL_TENSOR.TOKEN_EMBD: "token_embd",
    MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm",
    MODEL_TENSOR.TOKEN_TYPES: "token_types",
    MODEL_TENSOR.POS_EMBD: "position_embd",
    MODEL_TENSOR.OUTPUT_NORM: "output_norm",
    MODEL_TENSOR.OUTPUT: "output",
    MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
    MODEL_TENSOR.ROPE_FACTORS_LONG: "rope_factors_long",
    MODEL_TENSOR.ROPE_FACTORS_SHORT: "rope_factors_short",
    MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
    MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2",
    MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
    MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q",
    MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k",
    MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v",
    MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
    MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
    MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm",
    MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm",
    MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm",
    MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp",
    MODEL_TENSOR.FFN_GATE_INP_SHEXP: "blk.{bid}.ffn_gate_inp_shexp",
    MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
    MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
    MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
    MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
    MODEL_TENSOR.FFN_GATE_SHEXP: "blk.{bid}.ffn_gate_shexp",
    MODEL_TENSOR.FFN_DOWN_SHEXP: "blk.{bid}.ffn_down_shexp",
    MODEL_TENSOR.FFN_UP_SHEXP: "blk.{bid}.ffn_up_shexp",
    MODEL_TENSOR.FFN_ACT: "blk.{bid}.ffn",
    MODEL_TENSOR.FFN_NORM_EXP: "blk.{bid}.ffn_norm_exps",
    MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps",
    MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps",
    MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps",
    MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm",
    MODEL_TENSOR.SSM_IN: "blk.{bid}.ssm_in",
    MODEL_TENSOR.SSM_CONV1D: "blk.{bid}.ssm_conv1d",
    MODEL_TENSOR.SSM_X: "blk.{bid}.ssm_x",
    MODEL_TENSOR.SSM_DT: "blk.{bid}.ssm_dt",
    MODEL_TENSOR.SSM_A: "blk.{bid}.ssm_a",
    MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d",
    MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out",
    MODEL_TENSOR.ATTN_Q_A: "blk.{bid}.attn_q_a",
    MODEL_TENSOR.ATTN_Q_B: "blk.{bid}.attn_q_b",
    MODEL_TENSOR.ATTN_KV_A_MQA: "blk.{bid}.attn_kv_a_mqa",
    MODEL_TENSOR.ATTN_KV_B: "blk.{bid}.attn_kv_b",
    MODEL_TENSOR.ATTN_Q_A_NORM: "blk.{bid}.attn_q_a_norm",
    MODEL_TENSOR.ATTN_KV_A_NORM: "blk.{bid}.attn_kv_a_norm",
    MODEL_TENSOR.ATTN_SUB_NORM: "blk.{bid}.attn_sub_norm",
    MODEL_TENSOR.FFN_SUB_NORM: "blk.{bid}.ffn_sub_norm",
    MODEL_TENSOR.DEC_ATTN_NORM: "dec.blk.{bid}.attn_norm",
    MODEL_TENSOR.DEC_ATTN_Q: "dec.blk.{bid}.attn_q",
    MODEL_TENSOR.DEC_ATTN_K: "dec.blk.{bid}.attn_k",
    MODEL_TENSOR.DEC_ATTN_V: "dec.blk.{bid}.attn_v",
    MODEL_TENSOR.DEC_ATTN_OUT: "dec.blk.{bid}.attn_o",
    MODEL_TENSOR.DEC_ATTN_REL_B: "dec.blk.{bid}.attn_rel_b",
    MODEL_TENSOR.DEC_CROSS_ATTN_NORM: "dec.blk.{bid}.cross_attn_norm",
    MODEL_TENSOR.DEC_CROSS_ATTN_Q: "dec.blk.{bid}.cross_attn_q",
    MODEL_TENSOR.DEC_CROSS_ATTN_K: "dec.blk.{bid}.cross_attn_k",
    MODEL_TENSOR.DEC_CROSS_ATTN_V: "dec.blk.{bid}.cross_attn_v",
    MODEL_TENSOR.DEC_CROSS_ATTN_OUT: "dec.blk.{bid}.cross_attn_o",
    MODEL_TENSOR.DEC_CROSS_ATTN_REL_B: "dec.blk.{bid}.cross_attn_rel_b",
    MODEL_TENSOR.DEC_FFN_NORM: "dec.blk.{bid}.ffn_norm",
    MODEL_TENSOR.DEC_FFN_GATE: "dec.blk.{bid}.ffn_gate",
    MODEL_TENSOR.DEC_FFN_DOWN: "dec.blk.{bid}.ffn_down",
    MODEL_TENSOR.DEC_FFN_UP: "dec.blk.{bid}.ffn_up",
    MODEL_TENSOR.DEC_OUTPUT_NORM: "dec.output_norm",
    MODEL_TENSOR.ENC_ATTN_NORM: "enc.blk.{bid}.attn_norm",
    MODEL_TENSOR.ENC_ATTN_Q: "enc.blk.{bid}.attn_q",
    MODEL_TENSOR.ENC_ATTN_K: "enc.blk.{bid}.attn_k",
    MODEL_TENSOR.ENC_ATTN_V: "enc.blk.{bid}.attn_v",
    MODEL_TENSOR.ENC_ATTN_OUT: "enc.blk.{bid}.attn_o",
    MODEL_TENSOR.ENC_ATTN_REL_B: "enc.blk.{bid}.attn_rel_b",
    MODEL_TENSOR.ENC_FFN_NORM: "enc.blk.{bid}.ffn_norm",
    MODEL_TENSOR.ENC_FFN_GATE: "enc.blk.{bid}.ffn_gate",
    MODEL_TENSOR.ENC_FFN_DOWN: "enc.blk.{bid}.ffn_down",
    MODEL_TENSOR.ENC_FFN_UP: "enc.blk.{bid}.ffn_up",
    MODEL_TENSOR.ENC_OUTPUT_NORM: "enc.output_norm",
}

MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
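For reference, a small sketch of how these name templates resolve; the snippet itself is illustrative and not part of the diff, only the enum members and template strings above are.

```python
from gguf.constants import MODEL_TENSOR, TENSOR_NAMES

# {bid} is the block index; the new T5 and BitNet entries resolve to names such as:
print(TENSOR_NAMES[MODEL_TENSOR.DEC_ATTN_Q].format(bid=0))      # dec.blk.0.attn_q
print(TENSOR_NAMES[MODEL_TENSOR.ENC_FFN_UP].format(bid=11))     # enc.blk.11.ffn_up
print(TENSOR_NAMES[MODEL_TENSOR.ATTN_SUB_NORM].format(bid=3))   # blk.3.attn_sub_norm
```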
@ -824,6 +896,53 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
    MODEL_ARCH.BITNET: [
        MODEL_TENSOR.ATTN_Q,
        MODEL_TENSOR.ATTN_K,
        MODEL_TENSOR.ATTN_V,
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.FFN_NORM,
        MODEL_TENSOR.FFN_GATE,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
        MODEL_TENSOR.ATTN_SUB_NORM,
        MODEL_TENSOR.FFN_SUB_NORM,
    ],
    MODEL_ARCH.T5: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT,
        MODEL_TENSOR.DEC_ATTN_NORM,
        MODEL_TENSOR.DEC_ATTN_Q,
        MODEL_TENSOR.DEC_ATTN_K,
        MODEL_TENSOR.DEC_ATTN_V,
        MODEL_TENSOR.DEC_ATTN_OUT,
        MODEL_TENSOR.DEC_ATTN_REL_B,
        MODEL_TENSOR.DEC_CROSS_ATTN_NORM,
        MODEL_TENSOR.DEC_CROSS_ATTN_Q,
        MODEL_TENSOR.DEC_CROSS_ATTN_K,
        MODEL_TENSOR.DEC_CROSS_ATTN_V,
        MODEL_TENSOR.DEC_CROSS_ATTN_OUT,
        MODEL_TENSOR.DEC_CROSS_ATTN_REL_B,
        MODEL_TENSOR.DEC_FFN_NORM,
        MODEL_TENSOR.DEC_FFN_GATE,
        MODEL_TENSOR.DEC_FFN_DOWN,
        MODEL_TENSOR.DEC_FFN_UP,
        MODEL_TENSOR.DEC_OUTPUT_NORM,
        MODEL_TENSOR.ENC_ATTN_NORM,
        MODEL_TENSOR.ENC_ATTN_Q,
        MODEL_TENSOR.ENC_ATTN_K,
        MODEL_TENSOR.ENC_ATTN_V,
        MODEL_TENSOR.ENC_ATTN_OUT,
        MODEL_TENSOR.ENC_ATTN_REL_B,
        MODEL_TENSOR.ENC_FFN_NORM,
        MODEL_TENSOR.ENC_FFN_GATE,
        MODEL_TENSOR.ENC_FFN_DOWN,
        MODEL_TENSOR.ENC_FFN_UP,
        MODEL_TENSOR.ENC_OUTPUT_NORM,
    ],
    # TODO
}
@ -7,6 +7,7 @@ import struct
import tempfile
from dataclasses import dataclass
from enum import Enum, auto
from pathlib import Path
from io import BufferedWriter
from typing import IO, Any, Sequence, Mapping
from string import ascii_letters, digits

@ -31,6 +32,9 @@ from .quants import quant_shape_from_byte_shape
logger = logging.getLogger(__name__)


SHARD_NAME_FORMAT = "{:s}-{:05d}-of-{:05d}.gguf"


@dataclass
class TensorInfo:
    shape: Sequence[int]

@ -55,11 +59,11 @@ class WriterState(Enum):

class GGUFWriter:
    fout: list[BufferedWriter] | None
    path: Path | None
    temp_file: tempfile.SpooledTemporaryFile[bytes] | None
    tensors: list[dict[str, TensorInfo]]
    kv_data: list[dict[str, GGUFValue]]
    state: WriterState
    _simple_value_packing = {
        GGUFValueType.UINT8: "B",

@ -76,26 +80,38 @@ class GGUFWriter:
    }

    def __init__(
        self, path: os.PathLike[str] | str | None, arch: str, use_temp_file: bool = False, endianess: GGUFEndian = GGUFEndian.LITTLE,
        split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False
    ):
        self.fout = None
        self.path = Path(path) if path else None
        self.arch = arch
        self.endianess = endianess
        self.data_alignment = GGUF_DEFAULT_ALIGNMENT
        self.use_temp_file = use_temp_file
        self.temp_file = None
        self.tensors = [{}]
        self.kv_data = [{}]
        self.split_max_tensors = split_max_tensors
        self.split_max_size = split_max_size
        self.dry_run = dry_run
        self.small_first_shard = small_first_shard
        logger.info("gguf: This GGUF file is for {0} Endian only".format(
            "Big" if self.endianess == GGUFEndian.BIG else "Little",
        ))
        self.state = WriterState.NO_FILE

        if self.small_first_shard:
            self.tensors.append({})

        self.add_architecture()

    def format_shard_names(self, path: Path) -> list[Path]:
        if len(self.tensors) == 1:
            return [path]
        return [path.with_name(SHARD_NAME_FORMAT.format(path.stem, i + 1, len(self.tensors))) for i in range(len(self.tensors))]

    def open_output_file(self, path: Path | None = None) -> None:
        if self.state is WriterState.EMPTY and self.fout is not None and (path is None or path == self.path):
            # allow calling this multiple times as long as the path is the same
            return
@ -106,22 +122,58 @@ class GGUFWriter:
        self.path = path

        if self.path is not None:
            filenames = self.print_plan()
            self.fout = [open(filename, "wb") for filename in filenames]
            self.state = WriterState.EMPTY

    def print_plan(self) -> list[Path]:
        logger.info("Writing the following files:")
        assert self.path is not None
        filenames = self.format_shard_names(self.path)
        assert len(filenames) == len(self.tensors)
        for name, tensors in zip(filenames, self.tensors):
            logger.info(f"{name}: n_tensors = {len(tensors)}, total_size = {GGUFWriter.format_n_bytes_to_str(sum(ti.nbytes for ti in tensors.values()))}")

        if self.dry_run:
            logger.info("Dry run, not writing files")
            exit()

        return filenames

    def add_shard_kv_data(self) -> None:
        if len(self.tensors) == 1:
            return

        total_tensors = sum(len(t) for t in self.tensors)
        assert self.fout is not None
        total_splits = len(self.fout)
        self.kv_data.extend({} for _ in range(len(self.kv_data), total_splits))
        for i, kv_data in enumerate(self.kv_data):
            kv_data[Keys.Split.LLM_KV_SPLIT_NO] = GGUFValue(i, GGUFValueType.UINT16)
            kv_data[Keys.Split.LLM_KV_SPLIT_COUNT] = GGUFValue(total_splits, GGUFValueType.UINT16)
            kv_data[Keys.Split.LLM_KV_SPLIT_TENSORS_COUNT] = GGUFValue(total_tensors, GGUFValueType.INT32)

    def write_header_to_file(self, path: Path | None = None) -> None:
        if len(self.tensors) == 1 and (self.split_max_tensors != 0 or self.split_max_size != 0):
            logger.warning("Model fails split requirements, not splitting")

        self.open_output_file(path)

        if self.state is not WriterState.EMPTY:
            raise ValueError(f'Expected output file to be empty, got {self.state}')

        assert self.fout is not None
        assert len(self.fout) == len(self.tensors)
        assert len(self.kv_data) == 1

        self.add_shard_kv_data()

        for fout, tensors, kv_data in zip(self.fout, self.tensors, self.kv_data):
            fout.write(self._pack("<I", GGUF_MAGIC, skip_pack_prefix = True))
            fout.write(self._pack("I", GGUF_VERSION))
            fout.write(self._pack("Q", len(tensors)))
            fout.write(self._pack("Q", len(kv_data)))
            fout.flush()
        self.state = WriterState.HEADER
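A short sketch of the shard names that format_shard_names above produces; this mirrors the list comprehension in that method, and the three-shard count is a made-up value for illustration.

```python
from pathlib import Path

SHARD_NAME_FORMAT = "{:s}-{:05d}-of-{:05d}.gguf"  # as defined above

path = Path("model.gguf")
n_shards = 3  # assumed number of shards for this example
names = [path.with_name(SHARD_NAME_FORMAT.format(path.stem, i + 1, n_shards)) for i in range(n_shards)]
print(names)  # model-00001-of-00003.gguf, model-00002-of-00003.gguf, model-00003-of-00003.gguf
```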
||||||
def write_kv_data_to_file(self) -> None:
|
def write_kv_data_to_file(self) -> None:
|
||||||
|
@ -129,13 +181,15 @@ class GGUFWriter:
|
||||||
raise ValueError(f'Expected output file to contain the header, got {self.state}')
|
raise ValueError(f'Expected output file to contain the header, got {self.state}')
|
||||||
assert self.fout is not None
|
assert self.fout is not None
|
||||||
|
|
||||||
kv_data = bytearray()
|
for fout, kv_data in zip(self.fout, self.kv_data):
|
||||||
|
kv_bytes = bytearray()
|
||||||
|
|
||||||
for key, val in self.kv_data.items():
|
for key, val in kv_data.items():
|
||||||
kv_data += self._pack_val(key, GGUFValueType.STRING, add_vtype=False)
|
kv_bytes += self._pack_val(key, GGUFValueType.STRING, add_vtype=False)
|
||||||
kv_data += self._pack_val(val.value, val.type, add_vtype=True)
|
kv_bytes += self._pack_val(val.value, val.type, add_vtype=True)
|
||||||
|
|
||||||
|
fout.write(kv_bytes)
|
||||||
|
|
||||||
self.fout.write(kv_data)
|
|
||||||
self.flush()
|
self.flush()
|
||||||
self.state = WriterState.KV_DATA
|
self.state = WriterState.KV_DATA
|
||||||
|
|
||||||
|
@ -144,28 +198,29 @@ class GGUFWriter:
|
||||||
raise ValueError(f'Expected output file to contain KV data, got {self.state}')
|
raise ValueError(f'Expected output file to contain KV data, got {self.state}')
|
||||||
assert self.fout is not None
|
assert self.fout is not None
|
||||||
|
|
||||||
ti_data = bytearray()
|
for fout, tensors in zip(self.fout, self.tensors):
|
||||||
offset_tensor = 0
|
ti_data = bytearray()
|
||||||
|
offset_tensor = 0
|
||||||
|
|
||||||
for name, ti in self.tensors.items():
|
for name, ti in tensors.items():
|
||||||
ti_data += self._pack_val(name, GGUFValueType.STRING, add_vtype=False)
|
ti_data += self._pack_val(name, GGUFValueType.STRING, add_vtype=False)
|
||||||
n_dims = len(ti.shape)
|
n_dims = len(ti.shape)
|
||||||
ti_data += self._pack("I", n_dims)
|
ti_data += self._pack("I", n_dims)
|
||||||
for i in range(n_dims):
|
for j in range(n_dims):
|
||||||
ti_data += self._pack("Q", ti.shape[n_dims - 1 - i])
|
ti_data += self._pack("Q", ti.shape[n_dims - 1 - j])
|
||||||
ti_data += self._pack("I", ti.dtype)
|
ti_data += self._pack("I", ti.dtype)
|
||||||
ti_data += self._pack("Q", offset_tensor)
|
ti_data += self._pack("Q", offset_tensor)
|
||||||
offset_tensor += GGUFWriter.ggml_pad(ti.nbytes, self.data_alignment)
|
offset_tensor += GGUFWriter.ggml_pad(ti.nbytes, self.data_alignment)
|
||||||
|
|
||||||
self.fout.write(ti_data)
|
fout.write(ti_data)
|
||||||
self.flush()
|
fout.flush()
|
||||||
self.state = WriterState.TI_DATA
|
self.state = WriterState.TI_DATA
|
||||||
|
|
||||||
def add_key_value(self, key: str, val: Any, vtype: GGUFValueType) -> None:
|
def add_key_value(self, key: str, val: Any, vtype: GGUFValueType) -> None:
|
||||||
if key in self.kv_data:
|
if any(key in kv_data for kv_data in self.kv_data):
|
||||||
raise ValueError(f'Duplicated key name {key!r}')
|
raise ValueError(f'Duplicated key name {key!r}')
|
||||||
|
|
||||||
self.kv_data[key] = GGUFValue(value=val, type=vtype)
|
self.kv_data[0][key] = GGUFValue(value=val, type=vtype)
|
||||||
|
|
||||||
def add_uint8(self, key: str, val: int) -> None:
|
def add_uint8(self, key: str, val: int) -> None:
|
||||||
self.add_key_value(key,val, GGUFValueType.UINT8)
|
self.add_key_value(key,val, GGUFValueType.UINT8)
|
||||||
|
@ -206,9 +261,6 @@ class GGUFWriter:
|
||||||
self.add_key_value(key, val, GGUFValueType.STRING)
|
self.add_key_value(key, val, GGUFValueType.STRING)
|
||||||
|
|
||||||
def add_array(self, key: str, val: Sequence[Any]) -> None:
|
def add_array(self, key: str, val: Sequence[Any]) -> None:
|
||||||
if not isinstance(val, Sequence):
|
|
||||||
raise ValueError("Value must be a sequence for array type")
|
|
||||||
|
|
||||||
self.add_key_value(key, val, GGUFValueType.ARRAY)
|
self.add_key_value(key, val, GGUFValueType.ARRAY)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
@ -222,7 +274,7 @@ class GGUFWriter:
|
||||||
if self.state is not WriterState.NO_FILE:
|
if self.state is not WriterState.NO_FILE:
|
||||||
raise ValueError(f'Expected output file to be not yet opened, got {self.state}')
|
raise ValueError(f'Expected output file to be not yet opened, got {self.state}')
|
||||||
|
|
||||||
if name in self.tensors:
|
if any(name in tensors for tensors in self.tensors):
|
||||||
raise ValueError(f'Duplicated tensor name {name!r}')
|
raise ValueError(f'Duplicated tensor name {name!r}')
|
||||||
|
|
||||||
if raw_dtype is None:
|
if raw_dtype is None:
|
||||||
|
@ -247,7 +299,18 @@ class GGUFWriter:
|
||||||
if tensor_dtype == np.uint8:
|
if tensor_dtype == np.uint8:
|
||||||
tensor_shape = quant_shape_from_byte_shape(tensor_shape, raw_dtype)
|
tensor_shape = quant_shape_from_byte_shape(tensor_shape, raw_dtype)
|
||||||
|
|
||||||
self.tensors[name] = TensorInfo(shape=tensor_shape, dtype=dtype, nbytes=tensor_nbytes)
|
# make sure there is at least one tensor before splitting
|
||||||
|
if len(self.tensors[-1]) > 0:
|
||||||
|
if ( # split when over tensor limit
|
||||||
|
self.split_max_tensors != 0
|
||||||
|
and len(self.tensors[-1]) >= self.split_max_tensors
|
||||||
|
) or ( # split when over size limit
|
||||||
|
self.split_max_size != 0
|
||||||
|
and sum(ti.nbytes for ti in self.tensors[-1].values()) + tensor_nbytes > self.split_max_size
|
||||||
|
):
|
||||||
|
self.tensors.append({})
|
||||||
|
|
||||||
|
self.tensors[-1][name] = TensorInfo(shape=tensor_shape, dtype=dtype, nbytes=tensor_nbytes)
|
||||||
|
|
||||||
    def add_tensor(
        self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Sequence[int] | None = None,
@@ -264,7 +327,7 @@ class GGUFWriter:
        self.add_tensor_info(name, shape, tensor.dtype, tensor.nbytes, raw_dtype=raw_dtype)

        if self.temp_file is None:
-           self.tensors[name].tensor = tensor
+           self.tensors[-1][name].tensor = tensor
            return

        tensor.tofile(self.temp_file)
@@ -282,9 +345,24 @@ class GGUFWriter:

        if self.endianess == GGUFEndian.BIG:
            tensor.byteswap(inplace=True)
-       self.write_padding(self.fout, self.fout.tell())
-       tensor.tofile(self.fout)
-       self.write_padding(self.fout, tensor.nbytes)
+       file_id = -1
+       for i, tensors in enumerate(self.tensors):
+           if len(tensors) > 0:
+               file_id = i
+               break
+
+       fout = self.fout[file_id]
+
+       # pop the first tensor info
+       # TODO: cleaner way to get the first key
+       first_tensor_name = [name for name, _ in zip(self.tensors[file_id].keys(), range(1))][0]
+       ti = self.tensors[file_id].pop(first_tensor_name)
+       assert ti.nbytes == tensor.nbytes
+
+       self.write_padding(fout, fout.tell())
+       tensor.tofile(fout)
+       self.write_padding(fout, tensor.nbytes)

        self.state = WriterState.WEIGHTS

@@ -293,31 +371,43 @@ class GGUFWriter:

        assert self.fout is not None

-       self.write_padding(self.fout, self.fout.tell())
+       for fout in self.fout:
+           self.write_padding(fout, fout.tell())

        if self.temp_file is None:
+           shard_bar = None
            bar = None

            if progress:
                from tqdm import tqdm

-               total_bytes = sum(t.nbytes for t in self.tensors.values())
+               total_bytes = sum(ti.nbytes for t in self.tensors for ti in t.values())

+               if len(self.fout) > 1:
+                   shard_bar = tqdm(desc=f"Shard (0/{len(self.fout)})", total=None, unit="byte", unit_scale=True)
                bar = tqdm(desc="Writing", total=total_bytes, unit="byte", unit_scale=True)

-           # relying on the fact that Python dicts preserve insertion order (since 3.7)
-           for ti in self.tensors.values():
-               assert ti.tensor is not None # can only iterate once over the tensors
-               assert ti.tensor.nbytes == ti.nbytes
-               ti.tensor.tofile(self.fout)
-               if bar is not None:
-                   bar.update(ti.nbytes)
-               self.write_padding(self.fout, ti.nbytes)
-               ti.tensor = None
+           for i, (fout, tensors) in enumerate(zip(self.fout, self.tensors)):
+               if shard_bar is not None:
+                   shard_bar.set_description(f"Shard ({i + 1}/{len(self.fout)})")
+                   total = sum(ti.nbytes for ti in tensors.values())
+                   shard_bar.reset(total=(total if total > 0 else None))
+
+               # relying on the fact that Python dicts preserve insertion order (since 3.7)
+               for ti in tensors.values():
+                   assert ti.tensor is not None # can only iterate once over the tensors
+                   assert ti.tensor.nbytes == ti.nbytes
+                   ti.tensor.tofile(fout)
+                   if shard_bar is not None:
+                       shard_bar.update(ti.nbytes)
+                   if bar is not None:
+                       bar.update(ti.nbytes)
+                   self.write_padding(fout, ti.nbytes)
+                   ti.tensor = None
        else:
            self.temp_file.seek(0)

-           shutil.copyfileobj(self.temp_file, self.fout)
+           shutil.copyfileobj(self.temp_file, self.fout[0 if not self.small_first_shard else 1])
            self.flush()
            self.temp_file.close()
@@ -325,11 +415,13 @@ class GGUFWriter:

    def flush(self) -> None:
        assert self.fout is not None
-       self.fout.flush()
+       for fout in self.fout:
+           fout.flush()

    def close(self) -> None:
        if self.fout is not None:
-           self.fout.close()
+           for fout in self.fout:
+               fout.close()
            self.fout = None

    def add_architecture(self) -> None:
@@ -400,6 +492,9 @@ class GGUFWriter:
    def add_parallel_residual(self, use: bool) -> None:
        self.add_bool(Keys.LLM.USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)

+   def add_decoder_start_token_id(self, id: int) -> None:
+       self.add_uint32(Keys.LLM.DECODER_START_TOKEN_ID.format(arch=self.arch), id)
+
    def add_head_count(self, count: int) -> None:
        self.add_uint32(Keys.Attention.HEAD_COUNT.format(arch=self.arch), count)

@@ -448,6 +543,9 @@ class GGUFWriter:
    def add_kv_lora_rank(self, length: int) -> None:
        self.add_uint32(Keys.Attention.KV_LORA_RANK.format(arch=self.arch), length)

+   def add_relative_attn_buckets_count(self, value: int) -> None:
+       self.add_uint32(Keys.Attention.REL_BUCKETS_COUNT.format(arch=self.arch), value)
+
    def add_pooling_type(self, value: PoolingType) -> None:
        self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value.value)

@@ -538,6 +636,12 @@ class GGUFWriter:
    def add_add_space_prefix(self, value: bool) -> None:
        self.add_bool(Keys.Tokenizer.ADD_PREFIX, value)

+   def add_remove_extra_whitespaces(self, value: bool) -> None:
+       self.add_bool(Keys.Tokenizer.REMOVE_EXTRA_WS, value)
+
+   def add_precompiled_charsmap(self, charsmap: Sequence[bytes]) -> None:
+       self.add_array(Keys.Tokenizer.PRECOMPILED_CHARSMAP, charsmap)
+
    def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None:
        if not isinstance(value, str):
            template_default = None
@@ -599,9 +703,12 @@ class GGUFWriter:
            kv_data += self._pack("Q", len(encoded_val))
            kv_data += encoded_val
        elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and val:
-           ltype = GGUFValueType.get_type(val[0])
-           if not all(GGUFValueType.get_type(i) is ltype for i in val[1:]):
-               raise ValueError("All items in a GGUF array should be of the same type")
+           if isinstance(val, bytes):
+               ltype = GGUFValueType.UINT8
+           else:
+               ltype = GGUFValueType.get_type(val[0])
+               if not all(GGUFValueType.get_type(i) is ltype for i in val[1:]):
+                   raise ValueError("All items in a GGUF array should be of the same type")
            kv_data += self._pack("I", ltype)
            kv_data += self._pack("Q", len(val))
            for item in val:
@@ -611,6 +718,13 @@ class GGUFWriter:

        return kv_data

-   def _write_packed(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> None:
-       assert self.fout is not None
-       self.fout.write(self._pack(fmt, value, skip_pack_prefix))
+   @staticmethod
+   def format_n_bytes_to_str(num: int) -> str:
+       if num == 0:
+           return "negligible - metadata only"
+       fnum = float(num)
+       for unit in ("", "K", "M", "G"):
+           if abs(fnum) < 1000.0:
+               return f"{fnum:3.1f}{unit}"
+           fnum /= 1000.0
+       return f"{fnum:.1f}T - over 1TB, split recommended"
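Note (illustrative, not part of the diff): the format_n_bytes_to_str helper added at the end of the hunk above scales a byte count down by factors of 1000. A quick check of its behavior, assuming the implementation shown:

    for n in (0, 123, 4_700_000, 13_000_000_000):
        print(GGUFWriter.format_n_bytes_to_str(n))
    # "negligible - metadata only", "123.0", "4.7M", "13.0G"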
@@ -25,6 +25,7 @@ class TensorNameMap:
            "backbone.embeddings",        # mamba-hf
            "transformer.in_out_embed",   # Grok
            "embedding.word_embeddings",  # chatglm
+           "shared",                     # t5
        ),

        # Token type embeddings
@@ -423,6 +424,128 @@ class TensorNameMap:
        MODEL_TENSOR.ATTN_KV_A_NORM: (
            "model.layers.{bid}.self_attn.kv_a_layernorm", # deepseek2
        ),
+
+       MODEL_TENSOR.ATTN_SUB_NORM: (
+           "model.layers.{bid}.self_attn.inner_attn_ln", # bitnet
+       ),
+
+       MODEL_TENSOR.FFN_SUB_NORM: (
+           "model.layers.{bid}.mlp.ffn_layernorm", # bitnet
+       ),
+
+       MODEL_TENSOR.DEC_ATTN_NORM: (
+           "decoder.block.{bid}.layer.0.layer_norm", # t5
+       ),
+
+       MODEL_TENSOR.DEC_ATTN_Q: (
+           "decoder.block.{bid}.layer.0.SelfAttention.q", # t5
+       ),
+
+       MODEL_TENSOR.DEC_ATTN_K: (
+           "decoder.block.{bid}.layer.0.SelfAttention.k", # t5
+       ),
+
+       MODEL_TENSOR.DEC_ATTN_V: (
+           "decoder.block.{bid}.layer.0.SelfAttention.v", # t5
+       ),
+
+       MODEL_TENSOR.DEC_ATTN_OUT: (
+           "decoder.block.{bid}.layer.0.SelfAttention.o", # t5
+       ),
+
+       MODEL_TENSOR.DEC_ATTN_REL_B: (
+           "decoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias", # t5
+       ),
+
+       MODEL_TENSOR.DEC_CROSS_ATTN_NORM: (
+           "decoder.block.{bid}.layer.1.layer_norm", # t5
+       ),
+
+       MODEL_TENSOR.DEC_CROSS_ATTN_Q: (
+           "decoder.block.{bid}.layer.1.EncDecAttention.q", # t5
+       ),
+
+       MODEL_TENSOR.DEC_CROSS_ATTN_K: (
+           "decoder.block.{bid}.layer.1.EncDecAttention.k", # t5
+       ),
+
+       MODEL_TENSOR.DEC_CROSS_ATTN_V: (
+           "decoder.block.{bid}.layer.1.EncDecAttention.v", # t5
+       ),
+
+       MODEL_TENSOR.DEC_CROSS_ATTN_OUT: (
+           "decoder.block.{bid}.layer.1.EncDecAttention.o", # t5
+       ),
+
+       MODEL_TENSOR.DEC_CROSS_ATTN_REL_B: (
+           "decoder.block.{bid}.layer.1.EncDecAttention.relative_attention_bias", # t5
+       ),
+
+       MODEL_TENSOR.DEC_FFN_NORM: (
+           "decoder.block.{bid}.layer.2.layer_norm", # t5
+       ),
+
+       MODEL_TENSOR.DEC_FFN_GATE: (
+           "decoder.block.{bid}.layer.2.DenseReluDense.wi_0", # flan-t5
+       ),
+
+       MODEL_TENSOR.DEC_FFN_UP: (
+           "decoder.block.{bid}.layer.2.DenseReluDense.wi",   # t5
+           "decoder.block.{bid}.layer.2.DenseReluDense.wi_1", # flan-t5
+       ),
+
+       MODEL_TENSOR.DEC_FFN_DOWN: (
+           "decoder.block.{bid}.layer.2.DenseReluDense.wo", # t5
+       ),
+
+       MODEL_TENSOR.DEC_OUTPUT_NORM: (
+           "decoder.final_layer_norm", # t5
+       ),
+
+       MODEL_TENSOR.ENC_ATTN_NORM: (
+           "encoder.block.{bid}.layer.0.layer_norm", # t5
+       ),
+
+       MODEL_TENSOR.ENC_ATTN_Q: (
+           "encoder.block.{bid}.layer.0.SelfAttention.q", # t5
+       ),
+
+       MODEL_TENSOR.ENC_ATTN_K: (
+           "encoder.block.{bid}.layer.0.SelfAttention.k", # t5
+       ),
+
+       MODEL_TENSOR.ENC_ATTN_V: (
+           "encoder.block.{bid}.layer.0.SelfAttention.v", # t5
+       ),
+
+       MODEL_TENSOR.ENC_ATTN_OUT: (
+           "encoder.block.{bid}.layer.0.SelfAttention.o", # t5
+       ),
+
+       MODEL_TENSOR.ENC_ATTN_REL_B: (
+           "encoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias", # t5
+       ),
+
+       MODEL_TENSOR.ENC_FFN_NORM: (
+           "encoder.block.{bid}.layer.1.layer_norm", # t5
+       ),
+
+       MODEL_TENSOR.ENC_FFN_GATE: (
+           "encoder.block.{bid}.layer.1.DenseReluDense.wi_0", # flan-t5
+       ),
+
+       MODEL_TENSOR.ENC_FFN_UP: (
+           "encoder.block.{bid}.layer.1.DenseReluDense.wi",   # t5
+           "encoder.block.{bid}.layer.1.DenseReluDense.wi_1", # flan-t5
+       ),
+
+       MODEL_TENSOR.ENC_FFN_DOWN: (
+           "encoder.block.{bid}.layer.1.DenseReluDense.wo", # t5
+       ),
+
+       MODEL_TENSOR.ENC_OUTPUT_NORM: (
+           "encoder.final_layer_norm", # t5
+       ),
    }

    # architecture-specific block mappings
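Note (illustrative, not part of the diff): the {bid} placeholder in the new T5 mappings is filled with the block index when tensor names are resolved, e.g.:

    template = "decoder.block.{bid}.layer.0.SelfAttention.q"
    print(template.format(bid=3))  # decoder.block.3.layer.0.SelfAttention.q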
@@ -208,7 +208,9 @@ def translate_tensor_name(name):
        'ssm_d': 'State space model skip connection',
        'ssm_dt': 'State space model time step',
        'ssm_out': 'State space model output projection',
-       'blk': 'Block'
+       'blk': 'Block',
+       'enc': 'Encoder',
+       'dec': 'Decoder',
    }

    expanded_words = []
@@ -291,6 +293,10 @@ def dump_markdown_metadata(reader: GGUFReader, args: argparse.Namespace) -> None
        tensor_group_name = "base"
        if tensor_components[0] == 'blk':
            tensor_group_name = f"{tensor_components[0]}.{tensor_components[1]}"
+       elif tensor_components[0] in ['enc', 'dec'] and tensor_components[1] == 'blk':
+           tensor_group_name = f"{tensor_components[0]}.{tensor_components[1]}.{tensor_components[2]}"
+       elif tensor_components[0] in ['enc', 'dec']:
+           tensor_group_name = f"{tensor_components[0]}"

        # Check if new Tensor Group
        if tensor_group_name not in tensor_groups:
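Note (illustrative, not part of the diff): the grouping rule added above, as a standalone function over dot-separated GGUF tensor names (hypothetical helper mirroring the branches in the hunk):

    def tensor_group(name: str) -> str:
        parts = name.split('.')
        if parts[0] == 'blk':
            return f"{parts[0]}.{parts[1]}"
        if parts[0] in ('enc', 'dec') and len(parts) > 2 and parts[1] == 'blk':
            return f"{parts[0]}.{parts[1]}.{parts[2]}"
        if parts[0] in ('enc', 'dec'):
            return parts[0]
        return "base"

    print(tensor_group("dec.blk.0.attn_q.weight"))  # dec.blk.0
    print(tensor_group("enc.output_norm.weight"))   # enc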
408 llama.cpp

@@ -226,6 +226,7 @@ enum llm_arch {
    LLM_ARCH_ARCTIC,
    LLM_ARCH_DEEPSEEK2,
    LLM_ARCH_CHATGLM,
+   LLM_ARCH_BITNET,
    LLM_ARCH_UNKNOWN,
};

@@ -265,6 +266,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_ARCTIC,    "arctic"    },
    { LLM_ARCH_DEEPSEEK2, "deepseek2" },
    { LLM_ARCH_CHATGLM,   "chatglm"   },
+   { LLM_ARCH_BITNET,    "bitnet"    },
    { LLM_ARCH_UNKNOWN,   "(unknown)" },
};

@@ -504,6 +506,8 @@ enum llm_tensor {
    LLM_TENSOR_ATTN_KV_B,
    LLM_TENSOR_ATTN_Q_A_NORM,
    LLM_TENSOR_ATTN_KV_A_NORM,
+   LLM_TENSOR_ATTN_SUB_NORM,
+   LLM_TENSOR_FFN_SUB_NORM,
};

static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
@@ -1132,6 +1136,24 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
        },
    },
+   {
+       LLM_ARCH_BITNET,
+       {
+           { LLM_TENSOR_TOKEN_EMBD,    "token_embd" },
+           { LLM_TENSOR_OUTPUT_NORM,   "output_norm" },
+           { LLM_TENSOR_ATTN_Q,        "blk.%d.attn_q" },
+           { LLM_TENSOR_ATTN_K,        "blk.%d.attn_k" },
+           { LLM_TENSOR_ATTN_V,        "blk.%d.attn_v" },
+           { LLM_TENSOR_ATTN_OUT,      "blk.%d.attn_output" },
+           { LLM_TENSOR_ATTN_NORM,     "blk.%d.attn_norm" },
+           { LLM_TENSOR_ATTN_SUB_NORM, "blk.%d.attn_sub_norm" },
+           { LLM_TENSOR_FFN_GATE,      "blk.%d.ffn_gate" },
+           { LLM_TENSOR_FFN_DOWN,      "blk.%d.ffn_down" },
+           { LLM_TENSOR_FFN_UP,        "blk.%d.ffn_up" },
+           { LLM_TENSOR_FFN_NORM,      "blk.%d.ffn_norm" },
+           { LLM_TENSOR_FFN_SUB_NORM,  "blk.%d.ffn_sub_norm" },
+       },
+   },
    {
        LLM_ARCH_UNKNOWN,
        {
@@ -2139,6 +2161,8 @@ struct llama_layer {
    struct ggml_tensor * attn_out_norm_b;
    struct ggml_tensor * attn_q_a_norm;
    struct ggml_tensor * attn_kv_a_norm;
+   struct ggml_tensor * attn_sub_norm;
+   struct ggml_tensor * ffn_sub_norm;

    // attention
    struct ggml_tensor * wq;
@@ -2206,6 +2230,15 @@ struct llama_layer {
    // long rope factors
    struct ggml_tensor * rope_long  = nullptr;
    struct ggml_tensor * rope_short = nullptr;

+   // bitnet scale
+   struct ggml_tensor * wq_scale;
+   struct ggml_tensor * wk_scale;
+   struct ggml_tensor * wv_scale;
+   struct ggml_tensor * wo_scale;
+   struct ggml_tensor * ffn_gate_scale;
+   struct ggml_tensor * ffn_up_scale;
+   struct ggml_tensor * ffn_down_scale;
};

struct llama_kv_cell {
@@ -2314,6 +2347,8 @@ struct llama_vocab {
    enum llama_vocab_type     type     = LLAMA_VOCAB_TYPE_SPM;
    enum llama_vocab_pre_type type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;

+   int max_token_len = 0; // used for optimizing longest token search
+
    std::unordered_map<token, id> token_to_id;
    std::vector<token_data>       id_to_token;

@@ -4741,6 +4776,15 @@ static void llm_load_hparams(
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
+       case LLM_ARCH_BITNET:
+           {
+               ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+               switch (hparams.n_layer) {
+                   case 26: model.type = e_model::MODEL_3B; break;
+                   default: model.type = e_model::MODEL_UNKNOWN;
+               }
+           } break;
        default: (void)0;
    }

@@ -4979,6 +5023,7 @@ static void llm_load_vocab(
        GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);

        vocab.token_to_id[word] = i;
+       vocab.max_token_len = std::max(vocab.max_token_len, (int) word.size());

        auto & token_data = vocab.id_to_token[i];
        token_data.text = std::move(word);
@@ -5311,6 +5356,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
    if (vocab.special_middle_id != -1) { LLAMA_LOG_INFO( "%s: MID token = %d '%s'\n", __func__, vocab.special_middle_id, vocab.id_to_token[vocab.special_middle_id].text.c_str() ); }
    if (vocab.special_eot_id    != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id,    vocab.id_to_token[vocab.special_eot_id].text.c_str() ); }

+   LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, vocab.max_token_len);
+
    if (model.arch == LLM_ARCH_DEEPSEEK2) {
        LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
        LLAMA_LOG_INFO("%s: n_lora_q           = %d\n", __func__, hparams.n_lora_q);
@@ -6712,6 +6759,44 @@ static bool llm_load_tensors(
                    }
                }
            } break;
+       case LLM_ARCH_BITNET:
+           {
+               model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+               // output
+               {
+                   model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+               }
+
+               for (int i = 0; i < n_layer; ++i) {
+                   ggml_context * ctx_layer = ctx_for_layer(i);
+                   ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                   auto & layer = model.layers[i];
+
+                   layer.attn_norm     = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                   layer.attn_sub_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_SUB_NORM, "weight", i), {n_embd});
+
+                   layer.wq       = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+                   layer.wq_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "scale", i), {1});
+                   layer.wk       = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+                   layer.wk_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "scale", i), {1});
+                   layer.wv       = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+                   layer.wv_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "scale", i), {1});
+                   layer.wo       = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+                   layer.wo_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1});
+
+                   layer.ffn_norm     = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                   layer.ffn_sub_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_SUB_NORM, "weight", i), {n_ff});
+
+                   layer.ffn_gate       = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+                   layer.ffn_gate_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE, "scale", i), {1});
+                   layer.ffn_down       = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+                   layer.ffn_down_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1});
+                   layer.ffn_up         = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+                   layer.ffn_up_scale   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "scale", i), {1});
+               }
+           } break;
        case LLM_ARCH_CHATGLM:
            {
                model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -7396,7 +7481,10 @@ static struct ggml_tensor * llm_build_kqv(

        ggml_build_forward_expand(graph, cur);

-       cur = ggml_mul_mat(ctx, wo, cur);
+       if (wo) {
+           cur = ggml_mul_mat(ctx, wo, cur);
+       }

        if (wo_b) {
            cb(cur, "kqv_wo", il);
        }
@@ -7755,6 +7843,50 @@ struct llm_build_context {
        return lctx.inp_s_seq;
    }

+   struct ggml_cgraph * append_pooling(struct ggml_cgraph * gf) {
+       // find result_norm tensor for input
+       struct ggml_tensor * inp = nullptr;
+       for (int i = gf->n_nodes - 1; i >= 0; --i) {
+           inp = gf->nodes[i];
+           if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) {
+               break;
+           } else {
+               inp = nullptr;
+           }
+       }
+       GGML_ASSERT(inp != nullptr && "missing result_norm/result_embd tensor");
+
+       struct ggml_tensor * cur;
+
+       switch (pooling_type) {
+           case LLAMA_POOLING_TYPE_MEAN:
+               {
+                   struct ggml_tensor * inp_mean = build_inp_mean();
+                   cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, inp)), inp_mean);
+               } break;
+           case LLAMA_POOLING_TYPE_CLS:
+           case LLAMA_POOLING_TYPE_LAST:
+               {
+                   struct ggml_tensor * inp_cls = build_inp_cls();
+                   cur = ggml_get_rows(ctx0, inp, inp_cls);
+               } break;
+           case LLAMA_POOLING_TYPE_NONE:
+               {
+                   cur = inp;
+               } break;
+           default:
+               {
+                   GGML_ASSERT(false && "unknown pooling type");
+               } break;
+       }
+
+       cb(cur, "result_embd_pooled", -1);
+
+       ggml_build_forward_expand(gf, cur);
+
+       return gf;
+   }
+
    struct ggml_cgraph * build_llama() {
        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
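Note (illustrative, not part of the diff): append_pooling reduces per-token embeddings to per-sequence embeddings. The idea in plain numpy terms (toy shapes; the actual ggml tensor layouts in llama.cpp differ):

    import numpy as np

    hidden = np.random.rand(5, 8).astype(np.float32)    # [n_tokens, n_embd]

    # MEAN: matmul with an averaging matrix whose columns select and average each sequence's tokens
    inp_mean = np.zeros((5, 2), dtype=np.float32)
    inp_mean[:3, 0] = 1.0 / 3    # sequence 0 owns tokens 0..2
    inp_mean[3:, 1] = 1.0 / 2    # sequence 1 owns tokens 3..4
    mean_pooled = hidden.T @ inp_mean                    # [n_embd, n_seqs]

    # CLS / LAST: gather one representative row per sequence
    inp_cls = np.array([0, 4])                           # first (CLS) or last token index per sequence
    gathered = hidden[inp_cls]                           # [n_seqs, n_embd]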
@@ -8735,8 +8867,6 @@ struct llm_build_context {
        if (model.arch != LLM_ARCH_JINA_BERT_V2) {
            inp_pos = build_inp_pos();
        }
-       struct ggml_tensor * inp_mean = build_inp_mean();
-       struct ggml_tensor * inp_cls  = build_inp_cls();

        // construct input embeddings (token, type, position)
        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
@@ -8911,28 +9041,6 @@ struct llm_build_context {
        cur = inpL;
        cb(cur, "result_embd", -1);

-       // pooling layer
-       switch (pooling_type) {
-           case LLAMA_POOLING_TYPE_NONE:
-               {
-                   // nop
-               } break;
-           case LLAMA_POOLING_TYPE_MEAN:
-               {
-                   cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
-                   cb(cur, "result_embd_pooled", -1);
-               } break;
-           case LLAMA_POOLING_TYPE_CLS:
-               {
-                   cur = ggml_get_rows(ctx0, cur, inp_cls);
-                   cb(cur, "result_embd_pooled", -1);
-               } break;
-           case LLAMA_POOLING_TYPE_UNSPECIFIED:
-               {
-                   GGML_ASSERT(false && "Invalid pooling type");
-               } break;
-       }

        ggml_build_forward_expand(gf, cur);

        return gf;
@@ -11790,6 +11898,148 @@ struct llm_build_context {
        return gf;
    }

+   struct ggml_cgraph * build_bitnet() {
+       struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+       const int64_t n_embd_head = hparams.n_embd_head_v;
+       GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+       struct ggml_tensor * cur;
+       struct ggml_tensor * inpL;
+
+       inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+       // inp_pos - contains the positions
+       struct ggml_tensor * inp_pos = build_inp_pos();
+
+       // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+       struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+       for (int il = 0; il < n_layer; ++il) {
+           struct ggml_tensor * inpSA = inpL;
+
+           cur = llm_build_norm(ctx0, inpL, hparams,
+                   model.layers[il].attn_norm, NULL,
+                   LLM_NORM_RMS, cb, il);
+           cb(cur, "attn_norm", il);
+
+           // self-attention
+           {
+               // compute Q and K and RoPE them
+               struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+               Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_scale);
+               cb(Qcur, "Qcur", il);
+               if (model.layers[il].bq) {
+                   Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                   cb(Qcur, "Qcur", il);
+               }
+
+               // B1.K
+               struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+               Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_scale);
+               cb(Kcur, "Kcur", il);
+               if (model.layers[il].bk) {
+                   Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                   cb(Kcur, "Kcur", il);
+               }
+
+               // B1.V
+               struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+               Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_scale);
+               cb(Vcur, "Vcur", il);
+               if (model.layers[il].bv) {
+                   Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                   cb(Vcur, "Vcur", il);
+               }
+
+               Qcur = ggml_rope_ext(
+                   ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
+                   n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                   ext_factor, attn_factor, beta_fast, beta_slow
+               );
+               cb(Qcur, "Qcur", il);
+
+               Kcur = ggml_rope_ext(
+                   ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
+                   n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                   ext_factor, attn_factor, beta_fast, beta_slow
+               );
+               cb(Kcur, "Kcur", il);
+
+               // [the attention call is not shown in this capture of the diff]
+               cur = llm_build_norm(ctx0, cur, hparams,
+                   model.layers[il].attn_sub_norm, NULL,
+                   LLM_NORM_RMS, cb, il);
+               cb(cur, "attn_sub_norm", il);
+
+               cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
+               cur = ggml_mul(ctx0, cur, model.layers[il].wo_scale);
+               if (model.layers[il].bo) {
+                   cur = ggml_add(ctx0, cur, model.layers[il].bo);
+               }
+               cb(cur, "attn_o_out", il);
+           }
+
+           if (il == n_layer - 1) {
+               // skip computing output for unused tokens
+               struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+               cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
+               inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+           }
+
+           struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+           cb(ffn_inp, "ffn_inp", il);
+
+           // feed-forward forward
+           if (model.layers[il].ffn_gate_inp == nullptr) {
+               cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                       model.layers[il].ffn_norm, NULL,
+                       LLM_NORM_RMS, cb, il);
+               cb(cur, "ffn_norm", il);
+
+               struct ggml_tensor *tmp = ggml_mul_mat(ctx0, model.layers[il].ffn_up, cur);
+               tmp = ggml_mul(ctx0, tmp, model.layers[il].ffn_up_scale);
+               cb(tmp, "ffn_up", il);
+
+               cur = ggml_mul_mat(ctx0, model.layers[il].ffn_gate, cur);
+               cur = ggml_mul(ctx0, cur, model.layers[il].ffn_gate_scale);
+               cb(cur, "ffn_gate", il);
+
+               cur = ggml_silu(ctx0, cur);
+               cb(cur, "ffn_silu", il);
+
+               cur = ggml_mul(ctx0, cur, tmp);
+               cb(cur, "ffn_gate_par", il);
+
+               cur = llm_build_norm(ctx0, cur, hparams,
+                       model.layers[il].ffn_sub_norm, NULL,
+                       LLM_NORM_RMS, cb, il);
+               cb(cur, "ffn_sub_norm", il);
+
+               cur = ggml_mul_mat(ctx0, model.layers[il].ffn_down, cur);
+               cur = ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale);
+               cb(cur, "ffn_down", il);
+           }
+           cur = ggml_add(ctx0, cur, ffn_inp);
+           cb(cur, "l_out", il);
+
+           // input for next layer
+           inpL = cur;
+       }
+
+       cur = inpL;
+
+       cur = llm_build_norm(ctx0, cur, hparams,
+               model.output_norm, NULL,
+               LLM_NORM_RMS, cb, -1);
+       cb(cur, "result_norm", -1);
+
+       // lm_head
+       cur = ggml_mul_mat(ctx0, model.tok_embd, cur);
+       cb(cur, "result_output", -1);
+
+       ggml_build_forward_expand(gf, cur);
+       return gf;
+   }
+
    struct ggml_cgraph * build_chatglm() {
        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

@@ -11903,6 +12153,7 @@ struct llm_build_context {

        return gf;
    }

};

static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@@ -12130,10 +12381,19 @@ static struct ggml_cgraph * llama_build_graph(
        {
            result = llm.build_chatglm();
        } break;
+   case LLM_ARCH_BITNET:
+       {
+           result = llm.build_bitnet();
+       } break;
    default:
        GGML_ASSERT(false);
    }

+   // add on pooling layer
+   if (lctx.cparams.embeddings) {
+       result = llm.append_pooling(result);
+   }
+
    llm.free();

    return result;
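Note (illustrative, not part of the diff): every projection in build_bitnet is a matmul followed by a multiplication with a per-tensor scale (wq_scale, ffn_up_scale, ...). A toy numpy sketch of that pattern (the weight values and scale here are made up):

    import numpy as np

    def scaled_projection(W, scale, x):
        # mirrors ggml_mul(ggml_mul_mat(W, x), w_scale)
        return (W @ x) * scale

    W = np.sign(np.random.randn(4, 4))   # stand-in for a low-bit weight matrix
    x = np.random.randn(4)
    y = scaled_projection(W, 0.02, x)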
@@ -12223,7 +12483,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
        // (!a || b) is a logical implication (a -> b)
        // !hparams.causal_attn -> !cparams.causal_attn
        (hparams.causal_attn || !cparams.causal_attn) &&
-       "causal attention with embedding models is not supported"
+       "causal attention is not supported by this model"
    );

    if (lctx.inp_KQ_mask) {
@@ -12355,6 +12615,37 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
        }
    }

+   if (cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) {
+       const int64_t n_tokens = batch.n_tokens;
+
+       GGML_ASSERT(lctx.inp_cls);
+       GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
+
+       uint32_t * data = (uint32_t *) lctx.inp_cls->data;
+       memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls));
+
+       std::vector<int> last_pos(n_tokens, -1);
+       std::vector<int> last_row(n_tokens, -1);
+
+       for (int i = 0; i < n_tokens; ++i) {
+           const llama_seq_id seq_id = batch.seq_id[i][0];
+           const llama_pos    pos    = batch.pos[i];
+
+           GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == LAST");
+
+           if (pos >= last_pos[seq_id]) {
+               last_pos[seq_id] = pos;
+               last_row[seq_id] = i;
+           }
+       }
+
+       for (int i = 0; i < n_tokens; ++i) {
+           if (last_row[i] >= 0) {
+               data[i] = last_row[i];
+           }
+       }
+   }
+
    if (kv_self.recurrent) {
        const int64_t n_kv = kv_self.n;
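Note (illustrative, not part of the diff): the LLAMA_POOLING_TYPE_LAST setup above records, for each sequence id, the batch row of its highest-position token. The same computation as a small Python function:

    def last_token_rows(seq_ids, positions):
        last_pos, last_row = {}, {}
        for row, (sid, pos) in enumerate(zip(seq_ids, positions)):
            if pos >= last_pos.get(sid, -1):
                last_pos[sid] = pos
                last_row[sid] = row
        return last_row

    print(last_token_rows([0, 0, 1, 1, 1], [0, 1, 0, 1, 2]))  # {0: 1, 1: 4}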
@@ -12416,8 +12707,8 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
    const auto n_embd  = hparams.n_embd;

    // TODO: use a per-batch flag for logits presence instead
-   const bool has_logits = cparams.causal_attn;
-   const bool has_embd   = cparams.embeddings && (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
+   const bool has_logits = !cparams.embeddings;
+   const bool has_embd   = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);

    const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
    const size_t embd_size   = has_embd   ?  n_embd*n_outputs_max : 0;
@@ -12547,11 +12838,13 @@ static int llama_decode_internal(
    std::vector<std::vector<llama_seq_id>> seq_id;

    // count outputs
-   if (batch_all.logits) {
+   if (cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE) {
+       n_outputs = n_tokens_all;
+   } else if (batch_all.logits) {
        for (uint32_t i = 0; i < n_tokens_all; ++i) {
            n_outputs += batch_all.logits[i] != 0;
        }
-   } else if (lctx.logits_all || (cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE)) {
+   } else if (lctx.logits_all) {
        n_outputs = n_tokens_all;
    } else {
        // keep last output only
@@ -12682,30 +12975,13 @@ static int llama_decode_internal(
            // no output
            res  = nullptr;
            embd = nullptr;
-       } else if (!hparams.causal_attn) {
-           res = nullptr; // do not extract logits for embedding models such as BERT
-
-           // token or sequence embeddings
-           embd = gf->nodes[gf->n_nodes - 1];
-
-           GGML_ASSERT(strcmp(embd->name, "result_embd") == 0 || strcmp(embd->name, "result_embd_pooled") == 0);
        } else if (cparams.embeddings) {
-           // the embeddings could be in the second to last tensor, or any of the previous tensors
-           int i_embd = gf->n_nodes - 2;
-           for (int i = 3; strcmp(embd->name, "result_norm") != 0; ++i) {
-               i_embd = gf->n_nodes - i;
-               if (i_embd < 0) { break; }
-               embd = gf->nodes[i_embd];
-           }
-           GGML_ASSERT(i_embd >= 0 && "missing result_norm tensor");
-
-           // TODO: use a per-batch flag to know when to skip logits while keeping embeddings
-           if (!cparams.causal_attn) {
-               res = nullptr; // do not extract logits when not needed
-               // skip computing logits
-               // TODO: is this safe?
-               gf->n_nodes = i_embd + 1;
+           res  = nullptr; // do not extract logits for embedding case
+           embd = gf->nodes[gf->n_nodes - 1];
+           if (strcmp(embd->name, "result_embd_pooled") != 0) {
+               embd = gf->nodes[gf->n_nodes - 2];
            }
+           GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
        } else {
            embd = nullptr; // do not extract embeddings when not needed
            GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
@@ -12728,12 +13004,6 @@ static int llama_decode_internal(
            }
        }

-   #ifdef GGML_PERF
-       // print timing information per ggml operation (for debugging purposes)
-       // requires GGML_PERF to be defined
-       ggml_graph_print(gf);
-   #endif

        // plot the computation graph in dot format (for debugging purposes)
        //if (n_past%100 == 0) {
        //    ggml_graph_dump_dot(gf, NULL, "llama.dot");
@@ -12774,11 +13044,10 @@ static int llama_decode_internal(
                    ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_outputs_new*n_embd*sizeof(float));
                }
            } break;
-       case LLAMA_POOLING_TYPE_CLS:
        case LLAMA_POOLING_TYPE_MEAN:
+       case LLAMA_POOLING_TYPE_CLS:
+       case LLAMA_POOLING_TYPE_LAST:
            {
-               GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0);

                // extract sequence embeddings
                auto & embd_seq_out = lctx.embd_seq;
                embd_seq_out.clear();
@@ -13676,7 +13945,7 @@ private:
struct llm_tokenizer_wpm {
    llm_tokenizer_wpm(const llama_vocab & vocab): vocab(vocab) {}

-   void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
+   void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) const {
        const auto & token_map = vocab.token_to_id;

        // normalize and split by whitespace
@@ -13685,7 +13954,7 @@ struct llm_tokenizer_wpm {
        // bos token prepended already

        // find the longest tokens that form the words
-       for (const std::string &word : words) {
+       for (const std::string & word : words) {
            // skip empty words
            if (word.size() == 0) {
                continue;
@@ -13702,7 +13971,7 @@ struct llm_tokenizer_wpm {
        for (int i = 0; i < n; ++i) {
            // loop through possible match length
            bool match = false;
-           for (int j = n; j > i; j--) {
+           for (int j = std::min(n, i + vocab.max_token_len + 1); j > i; j--) {
                auto it = token_map.find(word1.substr(i, j - i));
                if (it != token_map.end()) {
                    output.push_back(it->second);
@@ -13725,7 +13994,8 @@ struct llm_tokenizer_wpm {
        }
    }

-   std::vector<std::string> preprocess(const std::string & text) {
+   // TODO: reduce string copies by using cpts_offs array
+   std::vector<std::string> preprocess(const std::string & text) const {
        const std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
        std::vector<std::string> words(1, "");

@@ -14042,6 +14312,8 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                    output.push_back(vocab.special_cls_id);
                }

+               llm_tokenizer_wpm tokenizer(vocab);
+
                for (const auto & fragment : fragment_buffer) {
                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                        auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
@@ -14049,7 +14321,6 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
#ifdef PRETOKENIZERDEBUG
                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
#endif
-                       llm_tokenizer_wpm tokenizer(vocab);
                        tokenizer.tokenize(raw_text, output);
                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                        output.push_back(fragment.token);
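Note (illustrative, not part of the diff): the WPM change above bounds the greedy longest-match search by vocab.max_token_len, so substrings longer than any vocabulary entry are never probed. A Python sketch of the bounded search (toy vocab; the real tokenizer also has an unknown-token fallback not shown here):

    def wpm_longest_match(word, vocab, max_token_len):
        out, i, n = [], 0, len(word)
        while i < n:
            matched = False
            for j in range(min(n, i + max_token_len + 1), i, -1):  # longest candidates first
                tok = vocab.get(word[i:j])
                if tok is not None:
                    out.append(tok)
                    i = j
                    matched = True
                    break
            if not matched:
                break
        return out

    vocab = {"un": 0, "believ": 1, "able": 2}
    print(wpm_longest_match("unbelievable", vocab, 6))  # [0, 1, 2]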
@@ -16964,6 +17235,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
        case LLM_ARCH_BERT:
        case LLM_ARCH_NOMIC_BERT:
        case LLM_ARCH_STABLELM:
+       case LLM_ARCH_BITNET:
        case LLM_ARCH_QWEN:
        case LLM_ARCH_QWEN2:
        case LLM_ARCH_QWEN2MOE:
@@ -18367,6 +18639,10 @@ void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)
    ctx->abort_callback_data = abort_callback_data;
}

+void llama_set_embeddings(struct llama_context * ctx, bool embeddings) {
+   ctx->cparams.embeddings = embeddings;
+}
+
void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
    ctx->cparams.causal_attn = causal_attn;
}
6 llama.h

@@ -176,6 +176,7 @@ extern "C" {
        LLAMA_POOLING_TYPE_NONE = 0,
        LLAMA_POOLING_TYPE_MEAN = 1,
        LLAMA_POOLING_TYPE_CLS  = 2,
+       LLAMA_POOLING_TYPE_LAST = 3,
    };

    enum llama_split_mode {
@@ -295,7 +296,6 @@ extern "C" {

        enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
        enum llama_pooling_type      pooling_type;      // whether to pool (sum) embedding results by sequence id
-                                                        // (ignored if no pooling layer)

        // ref: https://github.com/ggerganov/llama.cpp/pull/2054
        float rope_freq_base; // RoPE base frequency, 0 = from model
@@ -789,6 +789,10 @@ extern "C" {
    // Get the number of threads used for prompt and batch processing (multiple token).
    LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx);

+   // Set whether the model is in embeddings mode or not
+   // If true, embeddings will be returned but logits will not
+   LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings);
+
    // Set whether to use causal attention or not
    // If set to true, the model will only attend to the past tokens
    LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
@@ -1,2 +1,2 @@
-r ./requirements-convert-legacy-llama.txt
-torch~=2.1.1
+torch~=2.2.1

@@ -1,2 +1,2 @@
-r ./requirements-convert-legacy-llama.txt
-torch~=2.1.1
+torch~=2.2.1

@@ -1,4 +1,4 @@
-numpy~=1.24.4
+numpy~=1.26.4
sentencepiece~=0.2.0
transformers>=4.40.1,<5.0.0
gguf>=0.1.0
43
sgemm.cpp
43
sgemm.cpp
|
@ -249,9 +249,8 @@ class tinyBLAS {
|
||||||
: A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
|
: A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
|
||||||
}
|
}
|
||||||
|
|
||||||
void matmul(int64_t m, int64_t n, int task) {
|
void matmul(int64_t m, int64_t n) {
|
||||||
if (task == GGML_TASK_TYPE_COMPUTE)
|
mnpack(0, m, 0, n);
|
||||||
mnpack(0, m, 0, n);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
@ -458,9 +457,8 @@ class tinyBLAS_Q0_ARM {
|
||||||
: A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
|
: A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
|
||||||
}
|
}
|
||||||
|
|
||||||
void matmul(int64_t m, int64_t n, int task) {
|
void matmul(int64_t m, int64_t n) {
|
||||||
if (task == GGML_TASK_TYPE_COMPUTE)
|
mnpack(0, m, 0, n);
|
||||||
mnpack(0, m, 0, n);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
@ -596,9 +594,8 @@ class tinyBLAS_Q0_AVX {
|
||||||
: A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
|
: A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
|
||||||
}
|
}
|
||||||
|
|
||||||
void matmul(int64_t m, int64_t n, int task) {
|
void matmul(int64_t m, int64_t n) {
|
||||||
if (task == GGML_TASK_TYPE_COMPUTE)
|
mnpack(0, m, 0, n);
|
||||||
mnpack(0, m, 0, n);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
@ -829,7 +826,7 @@ class tinyBLAS_Q0_AVX {
|
||||||
* For example, for single-threaded single-precision GEMM you can say
|
* For example, for single-threaded single-precision GEMM you can say
|
||||||
*
|
*
|
||||||
* llamafile_sgemm(m, n, k, A, lda, B, ldb, C, ldc,
|
* llamafile_sgemm(m, n, k, A, lda, B, ldb, C, ldc,
|
||||||
* 0, 1, GGML_TASK_TYPE_COMPUTE,
|
* 0, 1,
|
||||||
* GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32);
|
* GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32);
|
||||||
*
|
*
|
||||||
* @param m is rows in `A` and `C`
|
* @param m is rows in `A` and `C`
|
||||||
|
@ -843,14 +840,13 @@ class tinyBLAS_Q0_AVX {
|
||||||
* @param ldc is row stride of `C`
|
* @param ldc is row stride of `C`
|
||||||
* @param ith is thread id (must be less than `nth`)
|
* @param ith is thread id (must be less than `nth`)
|
||||||
* @param nth is number of threads (must be greater than zero)
|
* @param nth is number of threads (must be greater than zero)
|
||||||
* @param task is GGML task type
|
|
 * @param Atype is GGML data type of `A`
 * @param Btype is GGML data type of `B`
 * @param Ctype is GGML data type of `C`
 * @return true if this function was able to service the matmul request
 */
 bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda, const void *B, int64_t ldb, void *C,
-                     int64_t ldc, int ith, int nth, int task, int Atype, int Btype, int Ctype) {
+                     int64_t ldc, int ith, int nth, int Atype, int Btype, int Ctype) {

     assert(m >= 0);
     assert(n >= 0);

@@ -877,7 +873,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
             (const float *)B, ldb,
             (float *)C, ldc,
             ith, nth};
-        tb.matmul(m, n, task);
+        tb.matmul(m, n);
         return true;
 #elif defined(__AVX__) || defined(__AVX2__)
         if (k % 8)

@@ -887,7 +883,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
             (const float *)B, ldb,
             (float *)C, ldc,
             ith, nth};
-        tb.matmul(m, n, task);
+        tb.matmul(m, n);
         return true;
 #elif defined(__ARM_NEON)
         if (n < 4)

@@ -899,7 +895,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
             (const float *)B, ldb,
             (float *)C, ldc,
             ith, nth};
-        tb.matmul(m, n, task);
+        tb.matmul(m, n);
         return true;
 #else
         return false;

@@ -917,7 +913,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
             (const float *)B, ldb,
             (float *)C, ldc,
             ith, nth};
-        tb.matmul(m, n, task);
+        tb.matmul(m, n);
         return true;
 #elif (defined(__AVX__) || defined(__AVX2__)) && defined(__F16C__)
         if (k % 8)

@@ -929,7 +925,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
             (const float *)B, ldb,
             (float *)C, ldc,
             ith, nth};
-        tb.matmul(m, n, task);
+        tb.matmul(m, n);
         return true;
 #elif defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER)
         if (n < 8)

@@ -943,7 +939,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
             (const ggml_fp16_t *)B, ldb,
             (float *)C, ldc,
             ith, nth};
-        tb.matmul(m, n, task);
+        tb.matmul(m, n);
         return true;
 #elif defined(__ARM_NEON) && !defined(_MSC_VER)
         if (k % 4)

@@ -955,7 +951,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
             (const float *)B, ldb,
             (float *)C, ldc,
             ith, nth};
-        tb.matmul(m, n, task);
+        tb.matmul(m, n);
         return true;
 #else
         return false;

@@ -971,7 +967,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
             (const block_q8_0 *)B, ldb,
             (float *)C, ldc,
             ith, nth};
-        tb.matmul(m, n, task);
+        tb.matmul(m, n);
         return true;
 #elif defined(__ARM_FEATURE_DOTPROD)
         tinyBLAS_Q0_ARM<block_q8_0> tb{

@@ -979,7 +975,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
             (const block_q8_0 *)B, ldb,
             (float *)C, ldc,
             ith, nth};
-        tb.matmul(m, n, task);
+        tb.matmul(m, n);
         return true;
 #else
         return false;

@@ -995,7 +991,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
             (const block_q8_0 *)B, ldb,
             (float *)C, ldc,
             ith, nth};
-        tb.matmul(m, n, task);
+        tb.matmul(m, n);
         return true;
 #elif defined(__ARM_FEATURE_DOTPROD)
         tinyBLAS_Q0_ARM<block_q4_0> tb{

@@ -1003,7 +999,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
             (const block_q8_0 *)B, ldb,
             (float *)C, ldc,
             ith, nth};
-        tb.matmul(m, n, task);
+        tb.matmul(m, n);
         return true;
 #else
         return false;

@@ -1025,7 +1021,6 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
     (void)ldc;
     (void)ith;
     (void)nth;
-    (void)task;
     (void)Atype;
     (void)Btype;
     (void)Ctype;
2
sgemm.h
@@ -7,7 +7,7 @@ extern "C" {

 bool llamafile_sgemm(int64_t, int64_t, int64_t, const void *, int64_t,
                      const void *, int64_t, void *, int64_t, int, int,
-                     int, int, int, int);
+                     int, int, int);

 #ifdef __cplusplus
 }
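With the `task` argument gone from both the implementation and the header, every call site passes one fewer parameter. A sketch of what a caller looks like after this change (the single-threaded ith/nth values, the leading-dimension choices, and the GGML_TYPE_F32 arguments below are illustrative assumptions, not taken from the patch):

#include "ggml.h"
#include "sgemm.h"

// Single-threaded F32 x F32 -> F32 multiply through the tinyBLAS entry point.
static bool mul_f32(const float *A, const float *B, float *C,
                    int64_t m, int64_t n, int64_t k,
                    int64_t lda, int64_t ldb, int64_t ldc) {
    return llamafile_sgemm(m, n, k,
                           A, lda,
                           B, ldb,
                           C, ldc,
                           /*ith=*/0, /*nth=*/1,
                           GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32);
}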
@@ -785,6 +785,10 @@ struct test_cpy : public test_case {
         return VARS_TO_STR3(type_src, type_dst, ne);
     }

+    double max_nmse_err() override {
+        return 1e-6;
+    }
+
     size_t op_size(ggml_tensor * t) override {
         return ggml_nbytes(t) + ggml_nbytes(t->src[0]);
     }
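The new max_nmse_err override sets the largest normalized mean-squared error the copy test will accept to 1e-6. The exact metric used by the harness is not shown in this hunk; for reference, a conventional NMSE computation looks like this sketch:

#include <cstddef>
#include <vector>

// Sum of squared differences, normalized by the squared magnitude of the reference.
static double nmse(const std::vector<double> & ref, const std::vector<double> & out) {
    double err = 0.0, norm = 0.0;
    for (size_t i = 0; i < ref.size(); ++i) {
        const double d = out[i] - ref[i];
        err  += d * d;
        norm += ref[i] * ref[i];
    }
    return norm > 0.0 ? err / norm : err;
}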
@@ -7,11 +7,16 @@
 #include "ggml.h"
 #include "llama.h"
 #include "grammar-parser.h"
+#include "json-schema-to-grammar.h"
 #include "unicode.h"
 #include <cassert>
 #include <string>
 #include <vector>

+using json = nlohmann::ordered_json;
+
+//#define INCLUDE_FAILING_TESTS 1
+
 static llama_grammar* build_grammar(const std::string & grammar_str) {
     auto parsed_grammar = grammar_parser::parse(grammar_str.c_str());

@@ -65,8 +70,8 @@ static bool match_string(const std::string & input, llama_grammar* grammar) {
     return false;
 }

-static void test_grammar(const std::string & test_desc, const std::string & grammar_str, const std::vector<std::string> & passing_strings, const std::vector<std::string> & failing_strings) {
-    fprintf(stderr, "⚫ Testing %s. Grammar: %s\n", test_desc.c_str(), grammar_str.c_str());
+static void test(const std::string & test_desc, const std::string & grammar_str, const std::vector<std::string> & passing_strings, const std::vector<std::string> & failing_strings) {
+    fprintf(stderr, "⚫ Testing %s\n%s\n", test_desc.c_str(), grammar_str.c_str());
     fflush(stderr);

     auto grammar = build_grammar(grammar_str);

@@ -85,6 +90,23 @@ static void test_grammar(const std::string & test_desc, const std::string & gram

         if (!matched) {
             fprintf(stderr, "❌ (failed to match)\n");
+
+            // DEBUG: Write strings to files so that we can analyze more easily with gbnf-validator program to see exactly where things failed.
+            // DEBUG: Write the grammar_str to test-grammar-integration.grammar.gbnf
+            FILE* grammar_file = fopen("test-grammar-integration.grammar.gbnf", "w");
+            if (grammar_file) {
+                fprintf(grammar_file, "%s", grammar_str.c_str());
+                fclose(grammar_file);
+            }
+
+            // DEBUG: Write the test string to test-grammar-integration.string.txt
+            FILE* string_file = fopen("test-grammar-integration.string.txt", "w");
+            if (string_file) {
+                fprintf(string_file, "%s", test_string.c_str());
+                fclose(string_file);
+            }
+
+            fprintf(stderr, "\n NOTE: Debug grammar file generated. To analyze this failure in detail, run the following command: ./llama-gbnf-validator test-grammar-integration.grammar.gbnf test-grammar-integration.string.txt\n\n");
         } else {
             fprintf(stdout, "✅︎\n");
         }

@@ -118,6 +140,12 @@ static void test_grammar(const std::string & test_desc, const std::string & gram
     // Clean up allocated memory
     llama_grammar_free(grammar);
 }
+static void test_grammar(const std::string & test_desc, const std::string & grammar_str, const std::vector<std::string> & passing_strings, const std::vector<std::string> & failing_strings) {
+    test(test_desc + ". Grammar: " + grammar_str, grammar_str, passing_strings, failing_strings);
+}
+static void test_schema(const std::string & test_desc, const std::string & schema_str, const std::vector<std::string> & passing_strings, const std::vector<std::string> & failing_strings) {
+    test(test_desc + ". Schema: " + schema_str, json_schema_to_grammar(json::parse(schema_str)), passing_strings, failing_strings);
+}

 static void test_simple_grammar() {
     // Test case for a simple grammar
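The new test_schema helper differs from test_grammar only in that it first converts a JSON schema into a GBNF grammar via json_schema_to_grammar before running the usual match checks. A minimal stand-alone sketch of that conversion step (it assumes the nlohmann header is reachable as "json.hpp" in this tree and that json_schema_to_grammar returns the grammar text as a std::string, as its use above suggests):

#include <cstdio>
#include <string>
#include "json.hpp"                   // assumed location of the nlohmann header
#include "json-schema-to-grammar.h"

int main() {
    const std::string schema_str = R"""({ "type": "string", "minLength": 1 })""";
    const nlohmann::ordered_json schema = nlohmann::ordered_json::parse(schema_str);
    const std::string gbnf = json_schema_to_grammar(schema);
    // The result can be fed to build_grammar()/match_string() like any hand-written grammar.
    printf("%s\n", gbnf.c_str());
    return 0;
}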
@@ -400,10 +428,11 @@ static void test_quantifiers() {
 static void test_failure_missing_root() {
     fprintf(stderr, "⚫ Testing missing root node:\n");
     // Test case for a grammar that is missing a root rule
-    const std::string grammar_str = R"""(rot ::= expr
-expr ::= term ("+" term)*
-term ::= number
-number ::= [0-9]+)""";
+    const std::string grammar_str = R"""(
+rot ::= expr
+expr ::= term ("+" term)*
+term ::= number
+number ::= [0-9]+)""";

     grammar_parser::parse_state parsed_grammar = grammar_parser::parse(grammar_str.c_str());

@@ -420,10 +449,10 @@ static void test_failure_missing_reference() {

     // Test case for a grammar that is missing a referenced rule
     const std::string grammar_str =
R"""(root ::= expr
expr ::= term ("+" term)*
term ::= numero
number ::= [0-9]+)""";

     fprintf(stderr, " Expected error: ");

@@ -445,29 +474,558 @@ static void test_failure_left_recursion() {

     // Test more complicated left recursion detection
     const std::string medium_str = R"""(
root ::= asdf
asdf ::= "a" | asdf "a"
)""";
     assert(test_build_grammar_fails(medium_str));

     // Test even more complicated left recursion detection
     const std::string hard_str = R"""(
root ::= asdf
asdf ::= "a" | foo "b"
foo ::= "c" | asdf "d" | "e")""";
     assert(test_build_grammar_fails(hard_str));

     // Test yet even more complicated left recursion detection
     const std::string hardest_str = R"""(
root ::= asdf
asdf ::= "a" | foo "b"
foo ::= "c" | empty asdf "d" | "e"
empty ::= "blah" | )""";
     assert(test_build_grammar_fails(hardest_str));

     fprintf(stderr, " ✅︎ Passed\n");
 }

+static void test_json_schema() {
+    // Note that this is similar to the regular grammar tests,
+    // but we convert each json schema to a grammar before parsing.
+    // Otherwise, this test structure is the same.
+
+    test_schema(
+        "empty schema (object)",
+        // Schema
+        R"""(
+            {}
+        )""",
+        // Passing strings
+        {
+            "{}",
+            R"""({"foo": "bar"})""",
+        },
+        // Failing strings
+        {
+            "",
+            "[]",
+            "null",
+            "\"\"",
+            "true",
+        }
+    );
+
+    test_schema(
+        "exotic formats (list)",
+        // Schema
+        R"""(
+            {
+                "items": [
+                    { "format": "date" },
+                    { "format": "uuid" },
+                    { "format": "time" },
+                    { "format": "date-time" }
+                ]
+            }
+        )""",
+        // Passing strings
+        {
+            // "{}", // NOTE: This string passes for this schema on https://www.jsonschemavalidator.net/ -- should it?
+            // "[]", // NOTE: This string passes for this schema on https://www.jsonschemavalidator.net/ -- should it?
+            R"""(["2012-04-23", "12345678-1234-1234-1234-1234567890ab", "18:25:43.511Z", "2012-04-23T18:25:43.511Z"])""",
+            //R"""(["2012-04-23","12345678-1234-1234-1234-1234567890ab"])""", // NOTE: This string passes for this schema on https://www.jsonschemavalidator.net/ -- should it?
+            //R"""({"foo": "bar"})""", // NOTE: This string passes for this schema on https://www.jsonschemavalidator.net/ -- should it?
+        },
+        // Failing strings
+        {
+            R"""(["foo", "bar"])""",
+            R"""(["12345678-1234-1234-1234-1234567890ab"])""",
+        }
+    );
+
+    test_schema(
+        "string",
+        // Schema
+        R"""(
+            {
+                "type": "string"
+            }
+        )""",
+        // Passing strings
+        {
+            "\"foo\"",
+            "\"bar\"",
+            "\"\"",
+        },
+        // Failing strings
+        {
+            "{}",
+            "\"foo\": \"bar\"",
+        }
+    );
+
+    test_schema(
+        "string w/ min length 1",
+        // Schema
+        R"""(
+            {
+                "type": "string",
+                "minLength": 1
+            }
+        )""",
+        // Passing strings
+        {
+            "\"foo\"",
+            "\"bar\"",
+        },
+        // Failing strings
+        {
+            "\"\"",
+            "{}",
+            "\"foo\": \"bar\"",
+        }
+    );
+
+    test_schema(
+        "string w/ min length 3",
+        // Schema
+        R"""(
+            {
+                "type": "string",
+                "minLength": 3
+            }
+        )""",
+        // Passing strings
+        {
+            "\"foo\"",
+            "\"bar\"",
+            "\"foobar\"",
+        },
+        // Failing strings
+        {
+            "\"\"",
+            "\"f\"",
+            "\"fo\"",
+        }
+    );
+
+    test_schema(
+        "string w/ max length",
+        // Schema
+        R"""(
+            {
+                "type": "string",
+                "maxLength": 3
+            }
+        )""",
+        // Passing strings
+        {
+            "\"foo\"",
+            "\"bar\"",
+            "\"\"",
+            "\"f\"",
+            "\"fo\"",
+        },
+        // Failing strings
+        {
+            "\"foobar\"",
+        }
+    );
+
+    test_schema(
+        "string w/ min & max length",
+        // Schema
+        R"""(
+            {
+                "type": "string",
+                "minLength": 1,
+                "maxLength": 4
+            }
+        )""",
+        // Passing strings
+        {
+            "\"foo\"",
+            "\"bar\"",
+            "\"f\"",
+            "\"barf\"",
+        },
+        // Failing strings
+        {
+            "\"\"",
+            "\"barfo\"",
+            "\"foobar\"",
+        }
+    );
+
+    test_schema(
+        "boolean",
+        // Schema
+        R"""(
+            {
+                "type": "boolean"
+            }
+        )""",
+        // Passing strings
+        {
+            "true",
+            "false",
+        },
+        // Failing strings
+        {
+            "\"\"",
+            "\"true\"",
+            "True",
+            "FALSE",
+        }
+    );
+
+    test_schema(
+        "integer",
+        // Schema
+        R"""(
+            {
+                "type": "integer"
+            }
+        )""",
+        // Passing strings
+        {
+            "0",
+            "12345",
+            "1234567890123456"
+        },
+        // Failing strings
+        {
+            "",
+            "01",
+            "007",
+            "12345678901234567"
+        }
+    );
+
+    test_schema(
+        "string const",
+        // Schema
+        R"""(
+            {
+                "const": "foo"
+            }
+        )""",
+        // Passing strings
+        {
+            "\"foo\"",
+        },
+        // Failing strings
+        {
+            "foo",
+            "\"bar\"",
+        }
+    );
+
+    test_schema(
+        "non-string const",
+        // Schema
+        R"""(
+            {
+                "const": true
+            }
+        )""",
+        // Passing strings
+        {
+            "true",
+        },
+        // Failing strings
+        {
+            "",
+            "foo",
+            "\"true\"",
+        }
+    );
+
+    test_schema(
+        "non-string const",
+        // Schema
+        R"""(
+            {
+                "enum": ["red", "amber", "green", null, 42, ["foo"]]
+            }
+        )""",
+        // Passing strings
+        {
+            "\"red\"",
+            "null",
+            "42",
+            "[\"foo\"]",
+        },
+        // Failing strings
+        {
+            "",
+            "420",
+            "true",
+            "foo",
+        }
+    );
+
+
+    test_schema(
+        "min+max items",
+        // Schema
+        R"""(
+            {
+                "items": {
+                    "type": ["number", "integer"]
+                },
+                "minItems": 3,
+                "maxItems": 5
+            }
+        )""",
+        // Passing strings
+        {
+            "[1, 2, 3]",
+            "[1, 2, 3, 4]",
+            "[1, 2, 3, 4, 5]",
+        },
+        // Failing strings
+        {
+            "[1, 2]",
+            "[1, 2, 3, 4, 5, 6]",
+            "1"
+        }
+    );
+
+    // Properties (from: https://json-schema.org/understanding-json-schema/reference/object#properties)
+    test_schema(
+        "object properties",
+        // Schema
+        R"""(
+            {
+                "type": "object",
+                "properties": {
+                    "number": { "type": "number" },
+                    "street_name": { "type": "string" },
+                    "street_type": { "enum": ["Street", "Avenue", "Boulevard"] }
+                }
+            }
+        )""",
+        // Passing strings
+        {
+            R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue"})""",
+            // "By default, leaving out properties is valid"
+            R"""({ "street_name": "Pennsylvania" })""",
+            R"""({ "number": 1600, "street_name": "Pennsylvania" })""",
+            // "By extension, even an empty object is valid"
+            R"""({})""",
+            // "By default, providing additional properties is valid"
+#ifdef INCLUDE_FAILING_TESTS
+            // TODO: The following should pass, but currently FAILS. Additional properties should be permitted by default.
+            R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue", "direction":"NW"})""",
+            // TODO: Spaces should be permitted around enum values, but currently they fail to pass.
+            R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type": "Avenue" })""",
+#endif
+        },
+        // Failing strings
+        {
+            // Change datatype from number to string
+            R"""({ "number": "1600", "street_name": "Pennsylvania", "street_type":"Avenue"})""",
+            // Reorder properties
+            R"""({ "street_name": "Pennsylvania", "number": 1600 })""",
+            // Reorder properties
+            R"""({ "number": "1600", "street_name": "Pennsylvania", "street_type":"Avenue"})""",
+        }
+    );
+
+
+    // Properties (from: https://json-schema.org/understanding-json-schema/reference/object#properties)
+    test_schema(
+        "object properties, additionalProperties: true",
+        // Schema
+        R"""(
+            {
+                "type": "object",
+                "properties": {
+                    "number": { "type": "number" },
+                    "street_name": { "type": "string" },
+                    "street_type": { "enum": ["Street", "Avenue", "Boulevard"] }
+                },
+                "additionalProperties": true
+            }
+        )""",
+        // Passing strings
+        {
+            // "By extension, even an empty object is valid"
+            R"""({})""",
+#ifdef INCLUDE_FAILING_TESTS
+            // TODO: Following line should pass and doesn't
+            R"""({"number":1600,"street_name":"Pennsylvania","street_type":"Avenue"})""",
+            // "By default, leaving out properties is valid"
+            // TODO: Following line should pass and doesn't
+            R"""({ "street_name": "Pennsylvania" })""",
+            // TODO: Following line should pass and doesn't
+            R"""({ "number": 1600, "street_name": "Pennsylvania" })""",
+            // "By default, providing additional properties is valid"
+            // TODO: The following should pass, but currently FAILS. Additional properties should be permitted by default.
+            R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue", "direction":"NW"})""",
+            // TODO: Spaces should be permitted around enum values, but currently they fail to pass.
+            R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type": "Avenue" })""",
+#endif
+        },
+        // Failing strings
+        {
+            // Change datatype from number to string
+            R"""({ "number": "1600", "street_name": "Pennsylvania", "street_type":"Avenue"})""",
+            // Reorder properties
+            R"""({ "street_name": "Pennsylvania", "number": 1600, "street_type":"Avenue"})""",
+        }
+    );
+
+    // Additional properties: false
+    test_schema(
+        "required + optional props each in original order",
+        // Schema
+        R"""(
+            {
+                "type": "object",
+                "properties": {
+                    "number": { "type": "number" },
+                    "street_name": { "type": "string" },
+                    "street_type": { "enum": ["Street", "Avenue", "Boulevard"] }
+                },
+                "additionalProperties": false
+            }
+        )""",
+        // Passing strings
+        {
+            R"""({ "street_name": "Pennsylvania" })""",
+            R"""({ "number": 1600, "street_type":"Avenue"})""",
+            R"""({ "number": 1600, "street_name": "Pennsylvania" })""",
+            R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue"})""",
+#ifdef INCLUDE_FAILING_TESTS
+            // TODO: Spaces should be permitted around enum values, but currently they fail to pass.
+            R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type": "Avenue" })""",
+#endif
+        },
+        // Failing strings
+        {
+            // Reorder properties
+            R"""({ "street_type": "Avenue", "number": 1600 })""",
+            // Add "direction"
+            R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type": "Avenue", "direction": "NW" })""",
+        }
+    );
+
+    test_schema(
+        "required + optional props each in original order",
+        // Schema
+        R"""(
+            {
+                "properties": {
+                    "b": {"type": "string"},
+                    "a": {"type": "string"},
+                    "d": {"type": "string"},
+                    "c": {"type": "string"}
+                },
+                "required": ["a", "b"],
+                "additionalProperties": false
+            }
+        )""",
+        // Passing strings
+        {
+            R"""({"b": "foo", "a": "bar"})""",
+            R"""({"b":"foo","a":"bar","d":"qux"})""",
+            R"""({"b":"foo", "a":"bar", "d":"qux", "c":"baz"})""",
+        },
+        // Failing strings
+        {
+            R"""({"a": "foo", "b": "bar"})""",
+            R"""({"b": "bar"})""",
+            R"""({"a": "foo", "c": "baz"})""",
+            R"""({"a":"foo", "b":"bar", "c":"baz", "d":"qux"})""",
+        }
+    );
+
+    // NOTE: Example from https://json-schema.org/learn/getting-started-step-by-step#define-required-properties
+    test_schema(
+        "required props",
+        // Schema
+        R"""(
+            {
+                "$schema": "https://json-schema.org/draft/2020-12/schema",
+                "$id": "https://example.com/product.schema.json",
+                "title": "Product",
+                "description": "A product from Acme's catalog",
+                "type": "object",
+                "properties": {
+                    "productId": {
+                        "description": "The unique identifier for a product",
+                        "type": "integer"
+                    },
+                    "productName": {
+                        "description": "Name of the product",
+                        "type": "string"
+                    },
+                    "price": {
+                        "description": "The price of the product",
+                        "type": "number",
+                        "exclusiveMinimum": 0
+                    },
+                    "tags": {
+                        "description": "Tags for the product",
+                        "type": "array",
+                        "items": {
+                            "type": "string"
+                        },
+                        "minItems": 1,
+                        "uniqueItems": true
+                    },
+                    "dimensions": {
+                        "type": "object",
+                        "properties": {
+                            "length": {
+                                "type": "number"
+                            },
+                            "width": {
+                                "type": "number"
+                            },
+                            "height": {
+                                "type": "number"
+                            }
+                        },
+                        "required": [ "length", "width", "height" ]
+                    }
+                },
+                "required": [ "productId", "productName", "price" ]
+            }
+        )""",
+        // Passing strings
+        {
+            R"""({"productId": 1, "productName": "A green door", "price": 12.50})""",
+            R"""({"productId": 1, "productName": "A green door", "price": 12.50, "tags": ["home", "green"]})""",
+            R"""({"productId": 1, "productName": "A green door", "price": 12.50, "tags": ["home", "green"], "dimensions": {"length": 785, "width": 250.5, "height": -0.359}})""",
+        },
+        // Failing strings
+        {
+            R"""({})""", // Missing all required properties
+            R"""({"productName": "A green door", "price": 12.50, "productId": 1})""", // Out of order properties
+            // TODO: The following line should fail, but currently it passes. `exclusiveMinimum` is not supported, as it would likely be too difficult to implement.
+            // Perhaps special checks for minimum and maximum values of 0 could be added (since that's relatively easy to do with grammars), but anything else would likely be too complex.
+            // R"""({"productId": 1, "productName": "A green door", "price": -12.50})""",
+            R"""({"productId": 1, "productName": "A green door"})""", // Missing required property (price)
+            R"""({"productName": "A green door", "price": 12.50})""", // Missing required property (productId)
+            R"""({"productId": 1, "productName": "A green door", "price": 12.50, "tags": []})""", // tags is empty, but minItems is 1
+            R"""({"productId": 1, "productName": "A green door", "price": 12.50, "dimensions": {"length": 785, "width": 250.5, "height": -0.359}, "tags": ["home", "green"]})""", // Tags and dimensions are out of order
+            // TODO: The following line should fail, but currently it passes. `uniqueItems` is not supported, as it would likely be too difficult to implement.
+            // R"""({"productId": 1, "productName": "A green door", "price": 12.50, "tags": ["home", "green", "home"]})""",
+        }
+    );
+}
+
 int main() {
     fprintf(stdout, "Running grammar integration tests...\n");
     test_simple_grammar();

@@ -477,6 +1035,7 @@ int main() {
     test_failure_missing_root();
     test_failure_missing_reference();
     test_failure_left_recursion();
+    test_json_schema();
     fprintf(stdout, "All tests passed.\n");
     return 0;
 }

@@ -596,6 +596,7 @@ std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & c

 std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
     std::vector<uint32_t> result;
+    result.reserve(utf8.size());
     size_t offset = 0;
     while (offset < utf8.size()) {
         result.push_back(unicode_cpt_from_utf8(utf8, offset));
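The added reserve() call pre-allocates one slot per input byte before the decode loop. A UTF-8 string of N bytes decodes to at most N code points, so this is a safe upper bound and the push_back loop never reallocates. A self-contained sketch of the same pattern (the "unicode.h" include is an assumption about where unicode_cpt_from_utf8 is declared):

#include <cstdint>
#include <string>
#include <vector>
#include "unicode.h"   // assumed to declare unicode_cpt_from_utf8

static std::vector<uint32_t> decode_utf8(const std::string & utf8) {
    std::vector<uint32_t> cpts;
    cpts.reserve(utf8.size());           // upper bound: one code point per byte
    size_t offset = 0;
    while (offset < utf8.size()) {
        cpts.push_back(unicode_cpt_from_utf8(utf8, offset));  // advances offset
    }
    return cpts;
}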
@@ -13,7 +13,7 @@ layout (constant_id = 0) const uint BLOCK_SIZE = 32;
 shared FLOAT_TYPE tmp[BLOCK_SIZE];

 void main() {
-    const uint row = gl_WorkGroupID.x;
+    const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
     const uint tid = gl_LocalInvocationID.x;

     uint a_offset, b_offset, d_offset;

@@ -7,7 +7,7 @@ layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
 shared FLOAT_TYPE tmp[32];

 void main() {
-    const uint row = gl_WorkGroupID.x;
+    const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;

     uint a_offset, b_offset, d_offset;
     get_offsets(a_offset, b_offset, d_offset);

@@ -7,7 +7,7 @@ layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
 shared FLOAT_TYPE tmp[32];

 void main() {
-    const uint row = gl_WorkGroupID.x;
+    const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;

     uint a_offset, b_offset, d_offset;
     get_offsets(a_offset, b_offset, d_offset);

@@ -7,7 +7,7 @@ layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
 shared FLOAT_TYPE tmp[32];

 void main() {
-    const uint row = gl_WorkGroupID.x;
+    const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;

     uint a_offset, b_offset, d_offset;
     get_offsets(a_offset, b_offset, d_offset);

@@ -7,7 +7,7 @@ layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
 shared FLOAT_TYPE tmp[32];

 void main() {
-    const uint row = gl_WorkGroupID.x;
+    const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;

     uint a_offset, b_offset, d_offset;
     get_offsets(a_offset, b_offset, d_offset);

@@ -7,7 +7,7 @@ layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
 shared FLOAT_TYPE tmp[32];

 void main() {
-    const uint row = gl_WorkGroupID.x;
+    const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;

     uint a_offset, b_offset, d_offset;
     get_offsets(a_offset, b_offset, d_offset);
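In each of these Vulkan compute shaders the target row is now derived from both the x and z workgroup coordinates, row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z, which flattens a 2-D dispatch grid into a single row index; presumably this lets one dispatch cover more rows than the x dimension alone can address. For example, with gl_NumWorkGroups.x = 4, the workgroup at (x = 1, z = 2) processes row 1 + 4 * 2 = 9.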