Merge branch 'ggerganov:master' into fix-cmake-pkg-missing-deps
This commit is contained in:
commit
0e69f27dfb
18 changed files with 1045 additions and 470 deletions
161
.clang-format
Normal file
161
.clang-format
Normal file
|
@ -0,0 +1,161 @@
|
||||||
|
---
|
||||||
|
Language: Cpp
|
||||||
|
AlignAfterOpenBracket: Align
|
||||||
|
AlignArrayOfStructures: Left
|
||||||
|
AlignConsecutiveAssignments: AcrossComments
|
||||||
|
AlignConsecutiveBitFields: AcrossComments
|
||||||
|
AlignConsecutiveDeclarations: AcrossComments
|
||||||
|
AlignConsecutiveMacros: AcrossComments
|
||||||
|
# AlignConsecutiveShortCaseStatements: AcrossComments
|
||||||
|
AlignEscapedNewlines: Left # LeftWithLastLine
|
||||||
|
AlignOperands: Align
|
||||||
|
AlignTrailingComments:
|
||||||
|
Kind: Always
|
||||||
|
OverEmptyLines: 1
|
||||||
|
AllowAllArgumentsOnNextLine: true
|
||||||
|
AllowAllParametersOfDeclarationOnNextLine: false
|
||||||
|
# AllowBreakBeforeNoexceptSpecifier: OnlyWithParen
|
||||||
|
AllowShortBlocksOnASingleLine: Never
|
||||||
|
AllowShortCaseLabelsOnASingleLine: false
|
||||||
|
AllowShortFunctionsOnASingleLine: Inline
|
||||||
|
AllowShortIfStatementsOnASingleLine: Never
|
||||||
|
AllowShortLambdasOnASingleLine: Inline
|
||||||
|
AllowShortLoopsOnASingleLine: false
|
||||||
|
AlwaysBreakBeforeMultilineStrings: true
|
||||||
|
BinPackArguments: true
|
||||||
|
BinPackParameters: true # OnePerLine
|
||||||
|
BitFieldColonSpacing: Both
|
||||||
|
BreakBeforeBraces: Custom # Attach
|
||||||
|
BraceWrapping:
|
||||||
|
AfterCaseLabel: true
|
||||||
|
AfterClass: false
|
||||||
|
AfterControlStatement: false
|
||||||
|
AfterEnum: false
|
||||||
|
AfterFunction: false
|
||||||
|
AfterNamespace: false
|
||||||
|
AfterObjCDeclaration: false
|
||||||
|
AfterStruct: false
|
||||||
|
AfterUnion: false
|
||||||
|
AfterExternBlock: false
|
||||||
|
BeforeCatch: false
|
||||||
|
BeforeElse: false
|
||||||
|
BeforeLambdaBody: false
|
||||||
|
BeforeWhile: false
|
||||||
|
IndentBraces: false
|
||||||
|
SplitEmptyFunction: false
|
||||||
|
SplitEmptyRecord: false
|
||||||
|
SplitEmptyNamespace: false
|
||||||
|
# BreakAdjacentStringLiterals: true
|
||||||
|
BreakAfterAttributes: Never
|
||||||
|
BreakBeforeBinaryOperators: None
|
||||||
|
BreakBeforeInlineASMColon: OnlyMultiline
|
||||||
|
BreakBeforeTernaryOperators: false
|
||||||
|
# BreakBinaryOperations: Never
|
||||||
|
BreakConstructorInitializers: AfterColon
|
||||||
|
# BreakFunctionDefinitionParameters: false
|
||||||
|
BreakInheritanceList: AfterComma
|
||||||
|
BreakStringLiterals: true
|
||||||
|
# BreakTemplateDeclarations: Yes
|
||||||
|
ColumnLimit: 120
|
||||||
|
CommentPragmas: '^ IWYU pragma:'
|
||||||
|
CompactNamespaces: false
|
||||||
|
ConstructorInitializerIndentWidth: 4
|
||||||
|
ContinuationIndentWidth: 4
|
||||||
|
Cpp11BracedListStyle: false
|
||||||
|
DerivePointerAlignment: false
|
||||||
|
DisableFormat: false
|
||||||
|
EmptyLineBeforeAccessModifier: Leave
|
||||||
|
EmptyLineAfterAccessModifier: Never
|
||||||
|
ExperimentalAutoDetectBinPacking: false
|
||||||
|
FixNamespaceComments: true
|
||||||
|
IncludeBlocks: Regroup
|
||||||
|
IncludeCategories:
|
||||||
|
- Regex: '^<.*\.h>'
|
||||||
|
Priority: 1
|
||||||
|
SortPriority: 0
|
||||||
|
- Regex: '^<.*'
|
||||||
|
Priority: 2
|
||||||
|
SortPriority: 0
|
||||||
|
- Regex: '.*'
|
||||||
|
Priority: 3
|
||||||
|
SortPriority: 0
|
||||||
|
IncludeIsMainRegex: '([-_](test|unittest))?$'
|
||||||
|
IncludeIsMainSourceRegex: ''
|
||||||
|
IndentAccessModifiers: false
|
||||||
|
IndentCaseBlocks: true
|
||||||
|
IndentCaseLabels: true
|
||||||
|
IndentExternBlock: NoIndent
|
||||||
|
IndentGotoLabels: false
|
||||||
|
IndentPPDirectives: AfterHash
|
||||||
|
IndentWidth: 4
|
||||||
|
IndentWrappedFunctionNames: false
|
||||||
|
InsertBraces: true # NOTE: may lead to incorrect formatting
|
||||||
|
InsertNewlineAtEOF: true
|
||||||
|
JavaScriptQuotes: Leave
|
||||||
|
JavaScriptWrapImports: true
|
||||||
|
KeepEmptyLinesAtTheStartOfBlocks: false
|
||||||
|
LambdaBodyIndentation: Signature
|
||||||
|
LineEnding: LF
|
||||||
|
MacroBlockBegin: ''
|
||||||
|
MacroBlockEnd: ''
|
||||||
|
MaxEmptyLinesToKeep: 1
|
||||||
|
NamespaceIndentation: None
|
||||||
|
ObjCBinPackProtocolList: Auto
|
||||||
|
ObjCBlockIndentWidth: 4
|
||||||
|
ObjCSpaceAfterProperty: true
|
||||||
|
ObjCSpaceBeforeProtocolList: true
|
||||||
|
PPIndentWidth: -1
|
||||||
|
PackConstructorInitializers: CurrentLine
|
||||||
|
PenaltyBreakAssignment: 2
|
||||||
|
PenaltyBreakBeforeFirstCallParameter: 1
|
||||||
|
PenaltyBreakComment: 300
|
||||||
|
PenaltyBreakFirstLessLess: 120
|
||||||
|
PenaltyBreakString: 1000
|
||||||
|
PenaltyBreakTemplateDeclaration: 10
|
||||||
|
PenaltyExcessCharacter: 1000000
|
||||||
|
PenaltyReturnTypeOnItsOwnLine: 200
|
||||||
|
PointerAlignment: Middle
|
||||||
|
QualifierAlignment: Left
|
||||||
|
#QualifierOrder: ['static', 'inline', 'friend', 'constexpr', 'const', 'volatile', 'type', 'restrict']
|
||||||
|
RawStringFormats:
|
||||||
|
- Language: Cpp
|
||||||
|
Delimiters:
|
||||||
|
- cc
|
||||||
|
- CC
|
||||||
|
- cpp
|
||||||
|
- Cpp
|
||||||
|
- CPP
|
||||||
|
- 'c++'
|
||||||
|
- 'C++'
|
||||||
|
CanonicalDelimiter: ''
|
||||||
|
ReferenceAlignment: Middle
|
||||||
|
ReflowComments: false # IndentOnly
|
||||||
|
SeparateDefinitionBlocks: Always
|
||||||
|
SortIncludes: CaseInsensitive
|
||||||
|
SortUsingDeclarations: LexicographicNumeric
|
||||||
|
SpaceAfterCStyleCast: true
|
||||||
|
SpaceAfterLogicalNot: false
|
||||||
|
SpaceAfterTemplateKeyword: true
|
||||||
|
SpaceBeforeAssignmentOperators: true
|
||||||
|
SpaceBeforeCpp11BracedList: false
|
||||||
|
SpaceBeforeCtorInitializerColon: true
|
||||||
|
SpaceBeforeInheritanceColon: true
|
||||||
|
SpaceBeforeParens: ControlStatements
|
||||||
|
SpaceBeforeRangeBasedForLoopColon: true
|
||||||
|
SpaceInEmptyBlock: false
|
||||||
|
SpaceInEmptyParentheses: false
|
||||||
|
SpacesBeforeTrailingComments: 2
|
||||||
|
SpacesInAngles: Never
|
||||||
|
SpacesInContainerLiterals: true
|
||||||
|
SpacesInLineCommentPrefix:
|
||||||
|
Minimum: 1
|
||||||
|
Maximum: -1
|
||||||
|
SpacesInParentheses: false
|
||||||
|
SpacesInSquareBrackets: false
|
||||||
|
SpaceBeforeSquareBrackets: false
|
||||||
|
Standard: c++17
|
||||||
|
TabWidth: 4
|
||||||
|
UseTab: Never
|
||||||
|
WhitespaceSensitiveMacros: ['STRINGIZE']
|
||||||
|
...
|
||||||
|
|
8
Makefile
8
Makefile
|
@ -730,10 +730,10 @@ GLSLC_CMD = glslc
|
||||||
_ggml_vk_genshaders_cmd = $(shell pwd)/vulkan-shaders-gen
|
_ggml_vk_genshaders_cmd = $(shell pwd)/vulkan-shaders-gen
|
||||||
_ggml_vk_header = ggml/src/ggml-vulkan-shaders.hpp
|
_ggml_vk_header = ggml/src/ggml-vulkan-shaders.hpp
|
||||||
_ggml_vk_source = ggml/src/ggml-vulkan-shaders.cpp
|
_ggml_vk_source = ggml/src/ggml-vulkan-shaders.cpp
|
||||||
_ggml_vk_input_dir = ggml/src/vulkan-shaders
|
_ggml_vk_input_dir = ggml/src/ggml-vulkan/vulkan-shaders
|
||||||
_ggml_vk_shader_deps = $(echo $(_ggml_vk_input_dir)/*.comp)
|
_ggml_vk_shader_deps = $(echo $(_ggml_vk_input_dir)/*.comp)
|
||||||
|
|
||||||
ggml/src/ggml-vulkan.o: ggml/src/ggml-vulkan.cpp ggml/include/ggml-vulkan.h $(_ggml_vk_header) $(_ggml_vk_source)
|
ggml/src/ggml-vulkan.o: ggml/src/ggml-vulkan/ggml-vulkan.cpp ggml/include/ggml-vulkan.h $(_ggml_vk_header) $(_ggml_vk_source)
|
||||||
$(CXX) $(CXXFLAGS) $(shell pkg-config --cflags vulkan) -c $< -o $@
|
$(CXX) $(CXXFLAGS) $(shell pkg-config --cflags vulkan) -c $< -o $@
|
||||||
|
|
||||||
$(_ggml_vk_header): $(_ggml_vk_source)
|
$(_ggml_vk_header): $(_ggml_vk_source)
|
||||||
|
@ -745,8 +745,8 @@ $(_ggml_vk_source): $(_ggml_vk_shader_deps) vulkan-shaders-gen
|
||||||
--target-hpp $(_ggml_vk_header) \
|
--target-hpp $(_ggml_vk_header) \
|
||||||
--target-cpp $(_ggml_vk_source)
|
--target-cpp $(_ggml_vk_source)
|
||||||
|
|
||||||
vulkan-shaders-gen: ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
|
vulkan-shaders-gen: ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
|
||||||
$(CXX) $(CXXFLAGS) -o $@ $(LDFLAGS) ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
|
$(CXX) $(CXXFLAGS) -o $@ $(LDFLAGS) ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
|
||||||
|
|
||||||
endif # GGML_VULKAN
|
endif # GGML_VULKAN
|
||||||
|
|
||||||
|
|
|
@ -34,9 +34,10 @@ The SYCL backend would be broken by some PRs due to no online CI.
|
||||||
|
|
||||||
The following release is verified with good quality:
|
The following release is verified with good quality:
|
||||||
|
|
||||||
|Commit ID|Tag|Release|Verified Platform|
|
|Commit ID|Tag|Release|Verified Platform| Update date|
|
||||||
|-|-|-|-|
|
|-|-|-|-|-|
|
||||||
|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1|
|
|3bcd40b3c593d14261fb2abfabad3c0fb5b9e318|b4040 |[llama-b4040-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b4040/llama-b4040-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1| 2024-11-19|
|
||||||
|
|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1||
|
||||||
|
|
||||||
|
|
||||||
## News
|
## News
|
||||||
|
|
|
@ -6,21 +6,21 @@
|
||||||
#include <clocale>
|
#include <clocale>
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
|
#include <cstdlib>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
#include <ctime>
|
#include <ctime>
|
||||||
#include <cstdlib>
|
|
||||||
#include <iterator>
|
#include <iterator>
|
||||||
#include <map>
|
#include <map>
|
||||||
#include <numeric>
|
#include <numeric>
|
||||||
#include <regex>
|
#include <regex>
|
||||||
#include <sstream>
|
#include <sstream>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
|
||||||
#include <thread>
|
#include <thread>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
#include "common.h"
|
|
||||||
|
|
||||||
#ifdef _WIN32
|
#ifdef _WIN32
|
||||||
# define WIN32_LEAN_AND_MEAN
|
# define WIN32_LEAN_AND_MEAN
|
||||||
|
@ -36,8 +36,7 @@ static uint64_t get_time_ns() {
|
||||||
return std::chrono::nanoseconds(clock::now().time_since_epoch()).count();
|
return std::chrono::nanoseconds(clock::now().time_since_epoch()).count();
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class T>
|
template <class T> static std::string join(const std::vector<T> & values, const std::string & delim) {
|
||||||
static std::string join(const std::vector<T> & values, const std::string & delim) {
|
|
||||||
std::ostringstream str;
|
std::ostringstream str;
|
||||||
for (size_t i = 0; i < values.size(); i++) {
|
for (size_t i = 0; i < values.size(); i++) {
|
||||||
str << values[i];
|
str << values[i];
|
||||||
|
@ -48,15 +47,13 @@ static std::string join(const std::vector<T> & values, const std::string & delim
|
||||||
return str.str();
|
return str.str();
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename T, typename F>
|
template <typename T, typename F> static std::vector<std::string> transform_to_str(const std::vector<T> & values, F f) {
|
||||||
static std::vector<std::string> transform_to_str(const std::vector<T> & values, F f) {
|
|
||||||
std::vector<std::string> str_values;
|
std::vector<std::string> str_values;
|
||||||
std::transform(values.begin(), values.end(), std::back_inserter(str_values), f);
|
std::transform(values.begin(), values.end(), std::back_inserter(str_values), f);
|
||||||
return str_values;
|
return str_values;
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename T>
|
template <typename T> static T avg(const std::vector<T> & v) {
|
||||||
static T avg(const std::vector<T> & v) {
|
|
||||||
if (v.empty()) {
|
if (v.empty()) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
@ -64,8 +61,7 @@ static T avg(const std::vector<T> & v) {
|
||||||
return sum / (T) v.size();
|
return sum / (T) v.size();
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename T>
|
template <typename T> static T stdev(const std::vector<T> & v) {
|
||||||
static T stdev(const std::vector<T> & v) {
|
|
||||||
if (v.size() <= 1) {
|
if (v.size() <= 1) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
@ -104,13 +100,20 @@ enum output_formats {NONE, CSV, JSON, JSONL, MARKDOWN, SQL};
|
||||||
|
|
||||||
static const char * output_format_str(output_formats format) {
|
static const char * output_format_str(output_formats format) {
|
||||||
switch (format) {
|
switch (format) {
|
||||||
case NONE: return "none";
|
case NONE:
|
||||||
case CSV: return "csv";
|
return "none";
|
||||||
case JSON: return "json";
|
case CSV:
|
||||||
case JSONL: return "jsonl";
|
return "csv";
|
||||||
case MARKDOWN: return "md";
|
case JSON:
|
||||||
case SQL: return "sql";
|
return "json";
|
||||||
default: GGML_ABORT("invalid output format");
|
case JSONL:
|
||||||
|
return "jsonl";
|
||||||
|
case MARKDOWN:
|
||||||
|
return "md";
|
||||||
|
case SQL:
|
||||||
|
return "sql";
|
||||||
|
default:
|
||||||
|
GGML_ABORT("invalid output format");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -135,10 +138,14 @@ static bool output_format_from_str(const std::string & s, output_formats & forma
|
||||||
|
|
||||||
static const char * split_mode_str(llama_split_mode mode) {
|
static const char * split_mode_str(llama_split_mode mode) {
|
||||||
switch (mode) {
|
switch (mode) {
|
||||||
case LLAMA_SPLIT_MODE_NONE: return "none";
|
case LLAMA_SPLIT_MODE_NONE:
|
||||||
case LLAMA_SPLIT_MODE_LAYER: return "layer";
|
return "none";
|
||||||
case LLAMA_SPLIT_MODE_ROW: return "row";
|
case LLAMA_SPLIT_MODE_LAYER:
|
||||||
default: GGML_ABORT("invalid split mode");
|
return "layer";
|
||||||
|
case LLAMA_SPLIT_MODE_ROW:
|
||||||
|
return "row";
|
||||||
|
default:
|
||||||
|
GGML_ABORT("invalid split mode");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -218,38 +225,59 @@ static void print_usage(int /* argc */, char ** argv) {
|
||||||
printf("options:\n");
|
printf("options:\n");
|
||||||
printf(" -h, --help\n");
|
printf(" -h, --help\n");
|
||||||
printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
|
printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
|
||||||
printf(" -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
|
printf(" -p, --n-prompt <n> (default: %s)\n",
|
||||||
|
join(cmd_params_defaults.n_prompt, ",").c_str());
|
||||||
printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
|
printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
|
||||||
printf(" -pg <pp,tg> (default: %s)\n", join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
|
printf(" -pg <pp,tg> (default: %s)\n",
|
||||||
printf(" -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
|
join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
|
||||||
printf(" -ub, --ubatch-size <n> (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str());
|
printf(" -b, --batch-size <n> (default: %s)\n",
|
||||||
printf(" -ctk, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
|
join(cmd_params_defaults.n_batch, ",").c_str());
|
||||||
printf(" -ctv, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
|
printf(" -ub, --ubatch-size <n> (default: %s)\n",
|
||||||
printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
|
join(cmd_params_defaults.n_ubatch, ",").c_str());
|
||||||
printf(" -C, --cpu-mask <hex,hex> (default: %s)\n", join(cmd_params_defaults.cpu_mask, ",").c_str());
|
printf(" -ctk, --cache-type-k <t> (default: %s)\n",
|
||||||
printf(" --cpu-strict <0|1> (default: %s)\n", join(cmd_params_defaults.cpu_strict, ",").c_str());
|
join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
|
||||||
|
printf(" -ctv, --cache-type-v <t> (default: %s)\n",
|
||||||
|
join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
|
||||||
|
printf(" -t, --threads <n> (default: %s)\n",
|
||||||
|
join(cmd_params_defaults.n_threads, ",").c_str());
|
||||||
|
printf(" -C, --cpu-mask <hex,hex> (default: %s)\n",
|
||||||
|
join(cmd_params_defaults.cpu_mask, ",").c_str());
|
||||||
|
printf(" --cpu-strict <0|1> (default: %s)\n",
|
||||||
|
join(cmd_params_defaults.cpu_strict, ",").c_str());
|
||||||
printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str());
|
printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str());
|
||||||
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
|
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n",
|
||||||
|
join(cmd_params_defaults.n_gpu_layers, ",").c_str());
|
||||||
if (llama_supports_rpc()) {
|
if (llama_supports_rpc()) {
|
||||||
printf(" -rpc, --rpc <rpc_servers> (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
|
printf(" -rpc, --rpc <rpc_servers> (default: %s)\n",
|
||||||
|
join(cmd_params_defaults.rpc_servers, ",").c_str());
|
||||||
}
|
}
|
||||||
printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
|
printf(" -sm, --split-mode <none|layer|row> (default: %s)\n",
|
||||||
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
|
join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
|
||||||
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
|
printf(" -mg, --main-gpu <i> (default: %s)\n",
|
||||||
printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
|
join(cmd_params_defaults.main_gpu, ",").c_str());
|
||||||
printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
|
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n",
|
||||||
|
join(cmd_params_defaults.no_kv_offload, ",").c_str());
|
||||||
|
printf(" -fa, --flash-attn <0|1> (default: %s)\n",
|
||||||
|
join(cmd_params_defaults.flash_attn, ",").c_str());
|
||||||
|
printf(" -mmp, --mmap <0|1> (default: %s)\n",
|
||||||
|
join(cmd_params_defaults.use_mmap, ",").c_str());
|
||||||
printf(" --numa <distribute|isolate|numactl> (default: disabled)\n");
|
printf(" --numa <distribute|isolate|numactl> (default: disabled)\n");
|
||||||
printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
|
printf(" -embd, --embeddings <0|1> (default: %s)\n",
|
||||||
|
join(cmd_params_defaults.embeddings, ",").c_str());
|
||||||
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
|
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
|
||||||
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
|
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
|
||||||
printf(" --prio <0|1|2|3> (default: %d)\n", cmd_params_defaults.prio);
|
printf(" --prio <0|1|2|3> (default: %d)\n", cmd_params_defaults.prio);
|
||||||
printf(" --delay <0...N> (seconds) (default: %d)\n", cmd_params_defaults.delay);
|
printf(" --delay <0...N> (seconds) (default: %d)\n", cmd_params_defaults.delay);
|
||||||
printf(" -o, --output <csv|json|jsonl|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
|
printf(" -o, --output <csv|json|jsonl|md|sql> (default: %s)\n",
|
||||||
printf(" -oe, --output-err <csv|json|jsonl|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr));
|
output_format_str(cmd_params_defaults.output_format));
|
||||||
|
printf(" -oe, --output-err <csv|json|jsonl|md|sql> (default: %s)\n",
|
||||||
|
output_format_str(cmd_params_defaults.output_format_stderr));
|
||||||
printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
|
printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
|
||||||
printf(" --progress (default: %s)\n", cmd_params_defaults.progress ? "1" : "0");
|
printf(" --progress (default: %s)\n", cmd_params_defaults.progress ? "1" : "0");
|
||||||
printf("\n");
|
printf("\n");
|
||||||
printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
|
printf(
|
||||||
|
"Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter "
|
||||||
|
"multiple times.\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
static ggml_type ggml_type_from_name(const std::string & s) {
|
static ggml_type ggml_type_from_name(const std::string & s) {
|
||||||
|
@ -281,7 +309,6 @@ static ggml_type ggml_type_from_name(const std::string & s) {
|
||||||
return GGML_TYPE_COUNT;
|
return GGML_TYPE_COUNT;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static cmd_params parse_cmd_params(int argc, char ** argv) {
|
static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||||
cmd_params params;
|
cmd_params params;
|
||||||
std::string arg;
|
std::string arg;
|
||||||
|
@ -476,10 +503,16 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||||
break;
|
break;
|
||||||
} else {
|
} else {
|
||||||
std::string value(argv[i]);
|
std::string value(argv[i]);
|
||||||
/**/ if (value == "distribute" || value == "" ) { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
|
/**/ if (value == "distribute" || value == "") {
|
||||||
else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
|
params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE;
|
||||||
else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
|
} else if (value == "isolate") {
|
||||||
else { invalid_param = true; break; }
|
params.numa = GGML_NUMA_STRATEGY_ISOLATE;
|
||||||
|
} else if (value == "numactl") {
|
||||||
|
params.numa = GGML_NUMA_STRATEGY_NUMACTL;
|
||||||
|
} else {
|
||||||
|
invalid_param = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} else if (arg == "-fa" || arg == "--flash-attn") {
|
} else if (arg == "-fa" || arg == "--flash-attn") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
|
@ -570,27 +603,69 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// set defaults
|
// set defaults
|
||||||
if (params.model.empty()) { params.model = cmd_params_defaults.model; }
|
if (params.model.empty()) {
|
||||||
if (params.n_prompt.empty()) { params.n_prompt = cmd_params_defaults.n_prompt; }
|
params.model = cmd_params_defaults.model;
|
||||||
if (params.n_gen.empty()) { params.n_gen = cmd_params_defaults.n_gen; }
|
}
|
||||||
if (params.n_pg.empty()) { params.n_pg = cmd_params_defaults.n_pg; }
|
if (params.n_prompt.empty()) {
|
||||||
if (params.n_batch.empty()) { params.n_batch = cmd_params_defaults.n_batch; }
|
params.n_prompt = cmd_params_defaults.n_prompt;
|
||||||
if (params.n_ubatch.empty()) { params.n_ubatch = cmd_params_defaults.n_ubatch; }
|
}
|
||||||
if (params.type_k.empty()) { params.type_k = cmd_params_defaults.type_k; }
|
if (params.n_gen.empty()) {
|
||||||
if (params.type_v.empty()) { params.type_v = cmd_params_defaults.type_v; }
|
params.n_gen = cmd_params_defaults.n_gen;
|
||||||
if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
|
}
|
||||||
if (params.rpc_servers.empty()) { params.rpc_servers = cmd_params_defaults.rpc_servers; }
|
if (params.n_pg.empty()) {
|
||||||
if (params.split_mode.empty()) { params.split_mode = cmd_params_defaults.split_mode; }
|
params.n_pg = cmd_params_defaults.n_pg;
|
||||||
if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; }
|
}
|
||||||
if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
|
if (params.n_batch.empty()) {
|
||||||
if (params.flash_attn.empty()) { params.flash_attn = cmd_params_defaults.flash_attn; }
|
params.n_batch = cmd_params_defaults.n_batch;
|
||||||
if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
|
}
|
||||||
if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; }
|
if (params.n_ubatch.empty()) {
|
||||||
if (params.embeddings.empty()) { params.embeddings = cmd_params_defaults.embeddings; }
|
params.n_ubatch = cmd_params_defaults.n_ubatch;
|
||||||
if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; }
|
}
|
||||||
if (params.cpu_mask.empty()) { params.cpu_mask = cmd_params_defaults.cpu_mask; }
|
if (params.type_k.empty()) {
|
||||||
if (params.cpu_strict.empty()) { params.cpu_strict = cmd_params_defaults.cpu_strict; }
|
params.type_k = cmd_params_defaults.type_k;
|
||||||
if (params.poll.empty()) { params.poll = cmd_params_defaults.poll; }
|
}
|
||||||
|
if (params.type_v.empty()) {
|
||||||
|
params.type_v = cmd_params_defaults.type_v;
|
||||||
|
}
|
||||||
|
if (params.n_gpu_layers.empty()) {
|
||||||
|
params.n_gpu_layers = cmd_params_defaults.n_gpu_layers;
|
||||||
|
}
|
||||||
|
if (params.rpc_servers.empty()) {
|
||||||
|
params.rpc_servers = cmd_params_defaults.rpc_servers;
|
||||||
|
}
|
||||||
|
if (params.split_mode.empty()) {
|
||||||
|
params.split_mode = cmd_params_defaults.split_mode;
|
||||||
|
}
|
||||||
|
if (params.main_gpu.empty()) {
|
||||||
|
params.main_gpu = cmd_params_defaults.main_gpu;
|
||||||
|
}
|
||||||
|
if (params.no_kv_offload.empty()) {
|
||||||
|
params.no_kv_offload = cmd_params_defaults.no_kv_offload;
|
||||||
|
}
|
||||||
|
if (params.flash_attn.empty()) {
|
||||||
|
params.flash_attn = cmd_params_defaults.flash_attn;
|
||||||
|
}
|
||||||
|
if (params.tensor_split.empty()) {
|
||||||
|
params.tensor_split = cmd_params_defaults.tensor_split;
|
||||||
|
}
|
||||||
|
if (params.use_mmap.empty()) {
|
||||||
|
params.use_mmap = cmd_params_defaults.use_mmap;
|
||||||
|
}
|
||||||
|
if (params.embeddings.empty()) {
|
||||||
|
params.embeddings = cmd_params_defaults.embeddings;
|
||||||
|
}
|
||||||
|
if (params.n_threads.empty()) {
|
||||||
|
params.n_threads = cmd_params_defaults.n_threads;
|
||||||
|
}
|
||||||
|
if (params.cpu_mask.empty()) {
|
||||||
|
params.cpu_mask = cmd_params_defaults.cpu_mask;
|
||||||
|
}
|
||||||
|
if (params.cpu_strict.empty()) {
|
||||||
|
params.cpu_strict = cmd_params_defaults.cpu_strict;
|
||||||
|
}
|
||||||
|
if (params.poll.empty()) {
|
||||||
|
params.poll = cmd_params_defaults.poll;
|
||||||
|
}
|
||||||
|
|
||||||
return params;
|
return params;
|
||||||
}
|
}
|
||||||
|
@ -633,12 +708,8 @@ struct cmd_params_instance {
|
||||||
}
|
}
|
||||||
|
|
||||||
bool equal_mparams(const cmd_params_instance & other) const {
|
bool equal_mparams(const cmd_params_instance & other) const {
|
||||||
return model == other.model &&
|
return model == other.model && n_gpu_layers == other.n_gpu_layers && rpc_servers == other.rpc_servers &&
|
||||||
n_gpu_layers == other.n_gpu_layers &&
|
split_mode == other.split_mode && main_gpu == other.main_gpu && use_mmap == other.use_mmap &&
|
||||||
rpc_servers == other.rpc_servers &&
|
|
||||||
split_mode == other.split_mode &&
|
|
||||||
main_gpu == other.main_gpu &&
|
|
||||||
use_mmap == other.use_mmap &&
|
|
||||||
tensor_split == other.tensor_split;
|
tensor_split == other.tensor_split;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -662,6 +733,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||||
std::vector<cmd_params_instance> instances;
|
std::vector<cmd_params_instance> instances;
|
||||||
|
|
||||||
// this ordering minimizes the number of times that each model needs to be reloaded
|
// this ordering minimizes the number of times that each model needs to be reloaded
|
||||||
|
// clang-format off
|
||||||
for (const auto & m : params.model)
|
for (const auto & m : params.model)
|
||||||
for (const auto & nl : params.n_gpu_layers)
|
for (const auto & nl : params.n_gpu_layers)
|
||||||
for (const auto & rpc : params.rpc_servers)
|
for (const auto & rpc : params.rpc_servers)
|
||||||
|
@ -767,6 +839,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||||
instances.push_back(instance);
|
instances.push_back(instance);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// clang-format on
|
||||||
|
|
||||||
return instances;
|
return instances;
|
||||||
}
|
}
|
||||||
|
@ -834,28 +907,21 @@ struct test {
|
||||||
(void) ctx;
|
(void) ctx;
|
||||||
}
|
}
|
||||||
|
|
||||||
uint64_t avg_ns() const {
|
uint64_t avg_ns() const { return ::avg(samples_ns); }
|
||||||
return ::avg(samples_ns);
|
|
||||||
}
|
|
||||||
|
|
||||||
uint64_t stdev_ns() const {
|
uint64_t stdev_ns() const { return ::stdev(samples_ns); }
|
||||||
return ::stdev(samples_ns);
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<double> get_ts() const {
|
std::vector<double> get_ts() const {
|
||||||
int n_tokens = n_prompt + n_gen;
|
int n_tokens = n_prompt + n_gen;
|
||||||
std::vector<double> ts;
|
std::vector<double> ts;
|
||||||
std::transform(samples_ns.begin(), samples_ns.end(), std::back_inserter(ts), [n_tokens](uint64_t t) { return 1e9 * n_tokens / t; });
|
std::transform(samples_ns.begin(), samples_ns.end(), std::back_inserter(ts),
|
||||||
|
[n_tokens](uint64_t t) { return 1e9 * n_tokens / t; });
|
||||||
return ts;
|
return ts;
|
||||||
}
|
}
|
||||||
|
|
||||||
double avg_ts() const {
|
double avg_ts() const { return ::avg(get_ts()); }
|
||||||
return ::avg(get_ts());
|
|
||||||
}
|
|
||||||
|
|
||||||
double stdev_ts() const {
|
double stdev_ts() const { return ::stdev(get_ts()); }
|
||||||
return ::stdev(get_ts());
|
|
||||||
}
|
|
||||||
|
|
||||||
static std::string get_backend() {
|
static std::string get_backend() {
|
||||||
std::vector<std::string> backends;
|
std::vector<std::string> backends;
|
||||||
|
@ -871,17 +937,11 @@ struct test {
|
||||||
|
|
||||||
static const std::vector<std::string> & get_fields() {
|
static const std::vector<std::string> & get_fields() {
|
||||||
static const std::vector<std::string> fields = {
|
static const std::vector<std::string> fields = {
|
||||||
"build_commit", "build_number",
|
"build_commit", "build_number", "cpu_info", "gpu_info", "backends", "model_filename",
|
||||||
"cpu_info", "gpu_info", "backends",
|
"model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads",
|
||||||
"model_filename", "model_type", "model_size", "model_n_params",
|
"cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers",
|
||||||
"n_batch", "n_ubatch",
|
"split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "use_mmap",
|
||||||
"n_threads", "cpu_mask", "cpu_strict", "poll",
|
"embeddings", "n_prompt", "n_gen", "test_time", "avg_ns", "stddev_ns",
|
||||||
"type_k", "type_v",
|
|
||||||
"n_gpu_layers", "split_mode",
|
|
||||||
"main_gpu", "no_kv_offload", "flash_attn",
|
|
||||||
"tensor_split", "use_mmap", "embeddings",
|
|
||||||
"n_prompt", "n_gen", "test_time",
|
|
||||||
"avg_ns", "stddev_ns",
|
|
||||||
"avg_ts", "stddev_ts",
|
"avg_ts", "stddev_ts",
|
||||||
};
|
};
|
||||||
return fields;
|
return fields;
|
||||||
|
@ -890,17 +950,14 @@ struct test {
|
||||||
enum field_type { STRING, BOOL, INT, FLOAT };
|
enum field_type { STRING, BOOL, INT, FLOAT };
|
||||||
|
|
||||||
static field_type get_field_type(const std::string & field) {
|
static field_type get_field_type(const std::string & field) {
|
||||||
if (field == "build_number" || field == "n_batch" || field == "n_ubatch" ||
|
if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || field == "n_threads" ||
|
||||||
field == "n_threads" || field == "poll" ||
|
field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" ||
|
||||||
field == "model_size" || field == "model_n_params" ||
|
field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "avg_ns" ||
|
||||||
field == "n_gpu_layers" || field == "main_gpu" ||
|
field == "stddev_ns") {
|
||||||
field == "n_prompt" || field == "n_gen" ||
|
|
||||||
field == "avg_ns" || field == "stddev_ns") {
|
|
||||||
return INT;
|
return INT;
|
||||||
}
|
}
|
||||||
if (field == "f16_kv" || field == "no_kv_offload" ||
|
if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" ||
|
||||||
field == "cpu_strict" ||
|
field == "use_mmap" || field == "embeddings") {
|
||||||
field == "flash_attn" || field == "use_mmap" || field == "embeddings") {
|
|
||||||
return BOOL;
|
return BOOL;
|
||||||
}
|
}
|
||||||
if (field == "avg_ts" || field == "stddev_ts") {
|
if (field == "avg_ts" || field == "stddev_ts") {
|
||||||
|
@ -925,20 +982,38 @@ struct test {
|
||||||
tensor_split_str += "/";
|
tensor_split_str += "/";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
std::vector<std::string> values = {
|
std::vector<std::string> values = { build_commit,
|
||||||
build_commit, std::to_string(build_number),
|
std::to_string(build_number),
|
||||||
cpu_info, gpu_info, get_backend(),
|
cpu_info,
|
||||||
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
|
gpu_info,
|
||||||
std::to_string(n_batch), std::to_string(n_ubatch),
|
get_backend(),
|
||||||
std::to_string(n_threads), cpu_mask, std::to_string(cpu_strict), std::to_string(poll),
|
model_filename,
|
||||||
ggml_type_name(type_k), ggml_type_name(type_v),
|
model_type,
|
||||||
std::to_string(n_gpu_layers), split_mode_str(split_mode),
|
std::to_string(model_size),
|
||||||
std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn),
|
std::to_string(model_n_params),
|
||||||
tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings),
|
std::to_string(n_batch),
|
||||||
std::to_string(n_prompt), std::to_string(n_gen), test_time,
|
std::to_string(n_ubatch),
|
||||||
std::to_string(avg_ns()), std::to_string(stdev_ns()),
|
std::to_string(n_threads),
|
||||||
std::to_string(avg_ts()), std::to_string(stdev_ts())
|
cpu_mask,
|
||||||
};
|
std::to_string(cpu_strict),
|
||||||
|
std::to_string(poll),
|
||||||
|
ggml_type_name(type_k),
|
||||||
|
ggml_type_name(type_v),
|
||||||
|
std::to_string(n_gpu_layers),
|
||||||
|
split_mode_str(split_mode),
|
||||||
|
std::to_string(main_gpu),
|
||||||
|
std::to_string(no_kv_offload),
|
||||||
|
std::to_string(flash_attn),
|
||||||
|
tensor_split_str,
|
||||||
|
std::to_string(use_mmap),
|
||||||
|
std::to_string(embeddings),
|
||||||
|
std::to_string(n_prompt),
|
||||||
|
std::to_string(n_gen),
|
||||||
|
test_time,
|
||||||
|
std::to_string(avg_ns()),
|
||||||
|
std::to_string(stdev_ns()),
|
||||||
|
std::to_string(avg_ts()),
|
||||||
|
std::to_string(stdev_ts()) };
|
||||||
return values;
|
return values;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -946,8 +1021,8 @@ struct test {
|
||||||
std::map<std::string, std::string> map;
|
std::map<std::string, std::string> map;
|
||||||
auto fields = get_fields();
|
auto fields = get_fields();
|
||||||
auto values = get_values();
|
auto values = get_values();
|
||||||
std::transform(fields.begin(), fields.end(), values.begin(),
|
std::transform(fields.begin(), fields.end(), values.begin(), std::inserter(map, map.end()),
|
||||||
std::inserter(map, map.end()), std::make_pair<const std::string &, const std::string &>);
|
std::make_pair<const std::string &, const std::string &>);
|
||||||
return map;
|
return map;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
@ -961,8 +1036,11 @@ struct printer {
|
||||||
virtual ~printer() {}
|
virtual ~printer() {}
|
||||||
|
|
||||||
FILE * fout;
|
FILE * fout;
|
||||||
|
|
||||||
virtual void print_header(const cmd_params & params) { (void) params; }
|
virtual void print_header(const cmd_params & params) { (void) params; }
|
||||||
|
|
||||||
virtual void print_test(const test & t) = 0;
|
virtual void print_test(const test & t) = 0;
|
||||||
|
|
||||||
virtual void print_footer() {}
|
virtual void print_footer() {}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -992,7 +1070,6 @@ struct csv_printer : public printer {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
static std::string escape_json(const std::string & value) {
|
static std::string escape_json(const std::string & value) {
|
||||||
std::string escaped;
|
std::string escaped;
|
||||||
for (auto c : value) {
|
for (auto c : value) {
|
||||||
|
@ -1033,7 +1110,8 @@ struct json_printer : public printer {
|
||||||
void print_fields(const std::vector<std::string> & fields, const std::vector<std::string> & values) {
|
void print_fields(const std::vector<std::string> & fields, const std::vector<std::string> & values) {
|
||||||
assert(fields.size() == values.size());
|
assert(fields.size() == values.size());
|
||||||
for (size_t i = 0; i < fields.size(); i++) {
|
for (size_t i = 0; i < fields.size(); i++) {
|
||||||
fprintf(fout, " \"%s\": %s,\n", fields.at(i).c_str(), format_json_value(fields.at(i), values.at(i)).c_str());
|
fprintf(fout, " \"%s\": %s,\n", fields.at(i).c_str(),
|
||||||
|
format_json_value(fields.at(i), values.at(i)).c_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1051,12 +1129,9 @@ struct json_printer : public printer {
|
||||||
fflush(fout);
|
fflush(fout);
|
||||||
}
|
}
|
||||||
|
|
||||||
void print_footer() override {
|
void print_footer() override { fprintf(fout, "\n]\n"); }
|
||||||
fprintf(fout, "\n]\n");
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
struct jsonl_printer : public printer {
|
struct jsonl_printer : public printer {
|
||||||
void print_fields(const std::vector<std::string> & fields, const std::vector<std::string> & values) {
|
void print_fields(const std::vector<std::string> & fields, const std::vector<std::string> & values) {
|
||||||
assert(fields.size() == values.size());
|
assert(fields.size() == values.size());
|
||||||
|
@ -1303,7 +1378,8 @@ struct sql_printer : public printer {
|
||||||
std::vector<std::string> fields = test::get_fields();
|
std::vector<std::string> fields = test::get_fields();
|
||||||
fprintf(fout, "CREATE TABLE IF NOT EXISTS test (\n");
|
fprintf(fout, "CREATE TABLE IF NOT EXISTS test (\n");
|
||||||
for (size_t i = 0; i < fields.size(); i++) {
|
for (size_t i = 0; i < fields.size(); i++) {
|
||||||
fprintf(fout, " %s %s%s\n", fields.at(i).c_str(), get_sql_field_type(fields.at(i)).c_str(), i < fields.size() - 1 ? "," : "");
|
fprintf(fout, " %s %s%s\n", fields.at(i).c_str(), get_sql_field_type(fields.at(i)).c_str(),
|
||||||
|
i < fields.size() - 1 ? "," : "");
|
||||||
}
|
}
|
||||||
fprintf(fout, ");\n");
|
fprintf(fout, ");\n");
|
||||||
fprintf(fout, "\n");
|
fprintf(fout, "\n");
|
||||||
|
@ -1505,13 +1581,15 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
if (t.n_prompt > 0) {
|
if (t.n_prompt > 0) {
|
||||||
if (params.progress) {
|
if (params.progress) {
|
||||||
fprintf(stderr, "llama-bench: benchmark %d/%ld: prompt run %d/%d\n", params_idx, params_count, i + 1, params.reps);
|
fprintf(stderr, "llama-bench: benchmark %d/%ld: prompt run %d/%d\n", params_idx, params_count,
|
||||||
|
i + 1, params.reps);
|
||||||
}
|
}
|
||||||
test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
|
test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
|
||||||
}
|
}
|
||||||
if (t.n_gen > 0) {
|
if (t.n_gen > 0) {
|
||||||
if (params.progress) {
|
if (params.progress) {
|
||||||
fprintf(stderr, "llama-bench: benchmark %d/%ld: generation run %d/%d\n", params_idx, params_count, i + 1, params.reps);
|
fprintf(stderr, "llama-bench: benchmark %d/%ld: generation run %d/%d\n", params_idx, params_count,
|
||||||
|
i + 1, params.reps);
|
||||||
}
|
}
|
||||||
test_gen(ctx, t.n_gen, t.n_threads);
|
test_gen(ctx, t.n_gen, t.n_threads);
|
||||||
}
|
}
|
||||||
|
|
|
@ -158,6 +158,7 @@ struct vk_device_struct {
|
||||||
std::string name;
|
std::string name;
|
||||||
uint64_t max_memory_allocation_size;
|
uint64_t max_memory_allocation_size;
|
||||||
bool fp16;
|
bool fp16;
|
||||||
|
bool pipeline_robustness;
|
||||||
vk::Device device;
|
vk::Device device;
|
||||||
uint32_t vendor_id;
|
uint32_t vendor_id;
|
||||||
vk_queue compute_queue;
|
vk_queue compute_queue;
|
||||||
|
@ -654,7 +655,7 @@ static uint32_t compile_count = 0;
|
||||||
static std::mutex compile_count_mutex;
|
static std::mutex compile_count_mutex;
|
||||||
static std::condition_variable compile_count_cond;
|
static std::condition_variable compile_count_cond;
|
||||||
|
|
||||||
static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipeline, const std::string name, size_t spv_size, const void* spv_data, const std::string entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, std::vector<uint32_t> specialization_constants, uint32_t align) {
|
static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipeline, const std::string name, size_t spv_size, const void* spv_data, const std::string entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, std::vector<uint32_t> specialization_constants, uint32_t align, bool disable_robustness) {
|
||||||
VK_LOG_DEBUG("ggml_vk_create_pipeline(" << device->name << ", " << name << ", " << entrypoint << ", " << parameter_count << ", " << push_constant_size << ", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " << align << ")");
|
VK_LOG_DEBUG("ggml_vk_create_pipeline(" << device->name << ", " << name << ", " << entrypoint << ", " << parameter_count << ", " << push_constant_size << ", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " << align << ")");
|
||||||
GGML_ASSERT(parameter_count > 0);
|
GGML_ASSERT(parameter_count > 0);
|
||||||
GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT
|
GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT
|
||||||
|
@ -724,6 +725,15 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
|
||||||
vk::PipelineCreateFlags(),
|
vk::PipelineCreateFlags(),
|
||||||
pipeline_shader_create_info,
|
pipeline_shader_create_info,
|
||||||
pipeline->layout);
|
pipeline->layout);
|
||||||
|
|
||||||
|
vk::PipelineRobustnessCreateInfoEXT rci;
|
||||||
|
|
||||||
|
if (device->pipeline_robustness && disable_robustness) {
|
||||||
|
rci.storageBuffers = vk::PipelineRobustnessBufferBehaviorEXT::eDisabled;
|
||||||
|
rci.uniformBuffers = vk::PipelineRobustnessBufferBehaviorEXT::eDisabled;
|
||||||
|
compute_pipeline_create_info.setPNext(&rci);
|
||||||
|
}
|
||||||
|
|
||||||
pipeline->pipeline = device->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value;
|
pipeline->pipeline = device->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value;
|
||||||
|
|
||||||
{
|
{
|
||||||
|
@ -1261,7 +1271,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||||
device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL] = std::make_shared<vk_matmul_pipeline_struct>();
|
device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL] = std::make_shared<vk_matmul_pipeline_struct>();
|
||||||
|
|
||||||
std::vector<std::future<void>> compiles;
|
std::vector<std::future<void>> compiles;
|
||||||
auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t spv_size, const void* spv_data, const std::string &entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, const std::vector<uint32_t>& specialization_constants, uint32_t align) {
|
auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t spv_size, const void* spv_data, const std::string &entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, const std::vector<uint32_t>& specialization_constants, uint32_t align, bool disable_robustness = false) {
|
||||||
{
|
{
|
||||||
// wait until fewer than N compiles are in progress
|
// wait until fewer than N compiles are in progress
|
||||||
uint32_t N = std::max(1u, std::thread::hardware_concurrency());
|
uint32_t N = std::max(1u, std::thread::hardware_concurrency());
|
||||||
|
@ -1271,7 +1281,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||||
}
|
}
|
||||||
compile_count++;
|
compile_count++;
|
||||||
}
|
}
|
||||||
compiles.push_back(std::async(ggml_vk_create_pipeline_func, std::ref(device), std::ref(pipeline), name, spv_size, spv_data, entrypoint, parameter_count, push_constant_size, wg_denoms, specialization_constants, align));
|
compiles.push_back(std::async(ggml_vk_create_pipeline_func, std::ref(device), std::ref(pipeline), name, spv_size, spv_data, entrypoint, parameter_count, push_constant_size, wg_denoms, specialization_constants, align, disable_robustness));
|
||||||
};
|
};
|
||||||
|
|
||||||
if (device->fp16) {
|
if (device->fp16) {
|
||||||
|
@ -1370,45 +1380,45 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||||
// computing two rows per workgroup is a benefit for Q4_0 -> Q5_1, but not for Q8_0.
|
// computing two rows per workgroup is a benefit for Q4_0 -> Q5_1, but not for Q8_0.
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f32_f32", mul_mat_vec_f32_f32_f32_len, mul_mat_vec_f32_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f32_f32", mul_mat_vec_f32_f32_f32_len, mul_mat_vec_f32_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f32_f32", mul_mat_vec_f16_f32_f32_len, mul_mat_vec_f16_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f32_f32", mul_mat_vec_f16_f32_f32_len, mul_mat_vec_f16_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f32_f32", mul_mat_vec_q4_0_f32_f32_len, mul_mat_vec_q4_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f32_f32", mul_mat_vec_q4_0_f32_f32_len, mul_mat_vec_q4_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f32_f32", mul_mat_vec_q4_1_f32_f32_len, mul_mat_vec_q4_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f32_f32", mul_mat_vec_q4_1_f32_f32_len, mul_mat_vec_q4_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f32_f32", mul_mat_vec_q5_0_f32_f32_len, mul_mat_vec_q5_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f32_f32", mul_mat_vec_q5_0_f32_f32_len, mul_mat_vec_q5_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f32_f32", mul_mat_vec_q5_1_f32_f32_len, mul_mat_vec_q5_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f32_f32", mul_mat_vec_q5_1_f32_f32_len, mul_mat_vec_q5_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f32_f32", mul_mat_vec_q8_0_f32_f32_len, mul_mat_vec_q8_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size, 1}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f32_f32", mul_mat_vec_q8_0_f32_f32_len, mul_mat_vec_q8_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size, 1}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f32_f32", mul_mat_vec_q2_k_f32_f32_len, mul_mat_vec_q2_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f32_f32", mul_mat_vec_q2_k_f32_f32_len, mul_mat_vec_q2_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f32_f32", mul_mat_vec_q3_k_f32_f32_len, mul_mat_vec_q3_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f32_f32", mul_mat_vec_q3_k_f32_f32_len, mul_mat_vec_q3_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f32_f32", mul_mat_vec_iq4_nl_f32_f32_len, mul_mat_vec_iq4_nl_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f32_f32", mul_mat_vec_iq4_nl_f32_f32_len, mul_mat_vec_iq4_nl_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
|
||||||
|
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f16_f32", mul_mat_vec_f32_f16_f32_len, mul_mat_vec_f32_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f16_f32", mul_mat_vec_f32_f16_f32_len, mul_mat_vec_f32_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f16_f32", mul_mat_vec_f16_f16_f32_len, mul_mat_vec_f16_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f16_f32", mul_mat_vec_f16_f16_f32_len, mul_mat_vec_f16_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f16_f32", mul_mat_vec_q4_0_f16_f32_len, mul_mat_vec_q4_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f16_f32", mul_mat_vec_q4_0_f16_f32_len, mul_mat_vec_q4_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f16_f32", mul_mat_vec_q4_1_f16_f32_len, mul_mat_vec_q4_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f16_f32", mul_mat_vec_q4_1_f16_f32_len, mul_mat_vec_q4_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f16_f32", mul_mat_vec_q5_0_f16_f32_len, mul_mat_vec_q5_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f16_f32", mul_mat_vec_q5_0_f16_f32_len, mul_mat_vec_q5_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f16_f32", mul_mat_vec_q5_1_f16_f32_len, mul_mat_vec_q5_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f16_f32", mul_mat_vec_q5_1_f16_f32_len, mul_mat_vec_q5_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f16_f32", mul_mat_vec_q8_0_f16_f32_len, mul_mat_vec_q8_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size, 1}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f16_f32", mul_mat_vec_q8_0_f16_f32_len, mul_mat_vec_q8_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size, 1}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f16_f32", mul_mat_vec_q2_k_f16_f32_len, mul_mat_vec_q2_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f16_f32", mul_mat_vec_q2_k_f16_f32_len, mul_mat_vec_q2_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f16_f32", mul_mat_vec_q3_k_f16_f32_len, mul_mat_vec_q3_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f16_f32", mul_mat_vec_q3_k_f16_f32_len, mul_mat_vec_q3_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f16_f32", mul_mat_vec_iq4_nl_f16_f32_len, mul_mat_vec_iq4_nl_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f16_f32", mul_mat_vec_iq4_nl_f16_f32_len, mul_mat_vec_iq4_nl_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size}, 1, true);
|
||||||
|
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32", mul_mat_vec_id_f32_f32_len, mul_mat_vec_id_f32_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32", mul_mat_vec_id_f32_f32_len, mul_mat_vec_id_f32_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F16 ], "mul_mat_vec_id_f16_f32", mul_mat_vec_id_f16_f32_len, mul_mat_vec_id_f16_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F16 ], "mul_mat_vec_id_f16_f32", mul_mat_vec_id_f16_f32_len, mul_mat_vec_id_f16_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_0], "mul_mat_vec_id_q4_0_f32", mul_mat_vec_id_q4_0_f32_len, mul_mat_vec_id_q4_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_0], "mul_mat_vec_id_q4_0_f32", mul_mat_vec_id_q4_0_f32_len, mul_mat_vec_id_q4_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_1], "mul_mat_vec_id_q4_1_f32", mul_mat_vec_id_q4_1_f32_len, mul_mat_vec_id_q4_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_1], "mul_mat_vec_id_q4_1_f32", mul_mat_vec_id_q4_1_f32_len, mul_mat_vec_id_q4_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_f32", mul_mat_vec_id_q5_0_f32_len, mul_mat_vec_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_f32", mul_mat_vec_id_q5_0_f32_len, mul_mat_vec_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_f32", mul_mat_vec_id_q5_1_f32_len, mul_mat_vec_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_f32", mul_mat_vec_id_q5_1_f32_len, mul_mat_vec_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_f32", mul_mat_vec_id_q8_0_f32_len, mul_mat_vec_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size, 1}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_f32", mul_mat_vec_id_q8_0_f32_len, mul_mat_vec_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size, 1}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_k_f32", mul_mat_vec_id_q2_k_f32_len, mul_mat_vec_id_q2_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_k_f32", mul_mat_vec_id_q2_k_f32_len, mul_mat_vec_id_q2_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_f32", mul_mat_vec_id_q3_k_f32_len, mul_mat_vec_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_f32", mul_mat_vec_id_q3_k_f32_len, mul_mat_vec_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true);
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", mul_mat_vec_id_iq4_nl_f32_len, mul_mat_vec_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", mul_mat_vec_id_iq4_nl_f32_len, mul_mat_vec_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
|
||||||
|
|
||||||
// dequant shaders
|
// dequant shaders
|
||||||
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16", dequant_f32_len, dequant_f32_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
|
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16", dequant_f32_len, dequant_f32_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
|
||||||
|
@ -1591,12 +1601,15 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||||
|
|
||||||
bool fp16_storage = false;
|
bool fp16_storage = false;
|
||||||
bool fp16_compute = false;
|
bool fp16_compute = false;
|
||||||
|
bool pipeline_robustness = false;
|
||||||
|
|
||||||
for (const auto& properties : ext_props) {
|
for (const auto& properties : ext_props) {
|
||||||
if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) {
|
if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) {
|
||||||
fp16_storage = true;
|
fp16_storage = true;
|
||||||
} else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) {
|
} else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) {
|
||||||
fp16_compute = true;
|
fp16_compute = true;
|
||||||
|
} else if (strcmp("VK_EXT_pipeline_robustness", properties.extensionName) == 0) {
|
||||||
|
pipeline_robustness = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1642,10 +1655,22 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||||
vk12_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES;
|
vk12_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES;
|
||||||
vk11_features.pNext = &vk12_features;
|
vk11_features.pNext = &vk12_features;
|
||||||
|
|
||||||
|
VkPhysicalDevicePipelineRobustnessFeaturesEXT pl_robustness_features;
|
||||||
|
pl_robustness_features.pNext = nullptr;
|
||||||
|
pl_robustness_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PIPELINE_ROBUSTNESS_FEATURES_EXT;
|
||||||
|
pl_robustness_features.pipelineRobustness = VK_FALSE;
|
||||||
|
|
||||||
|
if (pipeline_robustness) {
|
||||||
|
vk12_features.pNext = &pl_robustness_features;
|
||||||
|
device_extensions.push_back("VK_EXT_pipeline_robustness");
|
||||||
|
}
|
||||||
|
|
||||||
vkGetPhysicalDeviceFeatures2(device->physical_device, &device_features2);
|
vkGetPhysicalDeviceFeatures2(device->physical_device, &device_features2);
|
||||||
|
|
||||||
device->fp16 = device->fp16 && vk12_features.shaderFloat16;
|
device->fp16 = device->fp16 && vk12_features.shaderFloat16;
|
||||||
|
|
||||||
|
device->pipeline_robustness = pl_robustness_features.pipelineRobustness;
|
||||||
|
|
||||||
if (!vk11_features.storageBuffer16BitAccess) {
|
if (!vk11_features.storageBuffer16BitAccess) {
|
||||||
std::cerr << "ggml_vulkan: device " << GGML_VK_NAME << idx << " does not support 16-bit storage." << std::endl;
|
std::cerr << "ggml_vulkan: device " << GGML_VK_NAME << idx << " does not support 16-bit storage." << std::endl;
|
||||||
throw std::runtime_error("Unsupported device");
|
throw std::runtime_error("Unsupported device");
|
||||||
|
@ -3190,7 +3215,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
|
||||||
|
|
||||||
if (ne01 > max_groups_x) {
|
if (ne01 > max_groups_x) {
|
||||||
groups_z = 64;
|
groups_z = 64;
|
||||||
groups_x /= groups_z;
|
groups_x = CEIL_DIV(groups_x, groups_z);
|
||||||
}
|
}
|
||||||
|
|
||||||
// compute
|
// compute
|
||||||
|
@ -3767,7 +3792,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
|
||||||
|
|
||||||
if (ne01 > max_groups_x) {
|
if (ne01 > max_groups_x) {
|
||||||
groups_z = 64;
|
groups_z = 64;
|
||||||
groups_x /= groups_z;
|
groups_x = CEIL_DIV(groups_x, groups_z);
|
||||||
}
|
}
|
||||||
|
|
||||||
// compute
|
// compute
|
||||||
|
|
|
@ -2,6 +2,15 @@
|
||||||
#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
|
#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#include "types.comp"
|
||||||
|
|
||||||
|
#if defined(A_TYPE_PACKED16)
|
||||||
|
layout (binding = 0) readonly buffer A_PACKED16 {A_TYPE_PACKED16 data_a_packed16[];};
|
||||||
|
#endif
|
||||||
|
#if defined(A_TYPE_PACKED32)
|
||||||
|
layout (binding = 0) readonly buffer A_PACKED32 {A_TYPE_PACKED32 data_a_packed32[];};
|
||||||
|
#endif
|
||||||
|
|
||||||
#if defined(DATA_A_F32)
|
#if defined(DATA_A_F32)
|
||||||
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
|
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
|
||||||
return vec2(data_a[a_offset + ib], data_a[a_offset + ib + 1]);
|
return vec2(data_a[a_offset + ib], data_a[a_offset + ib + 1]);
|
||||||
|
@ -20,6 +29,11 @@ vec2 dequantize(uint ib, uint iqs, uint a_offset) {
|
||||||
const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
|
const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
|
||||||
return (vec2(vui & 0xF, vui >> 4) - 8.0f) * d;
|
return (vec2(vui & 0xF, vui >> 4) - 8.0f) * d;
|
||||||
}
|
}
|
||||||
|
vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
|
||||||
|
const float d = float(data_a_packed16[a_offset + ib].d);
|
||||||
|
const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]);
|
||||||
|
return (vec4(vui & 0xF, (vui >> 4) & 0xF, (vui >> 8) & 0xF, (vui >> 12) & 0xF) - 8.0f) * d;
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(DATA_A_Q4_1)
|
#if defined(DATA_A_Q4_1)
|
||||||
|
@ -29,6 +43,12 @@ vec2 dequantize(uint ib, uint iqs, uint a_offset) {
|
||||||
const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
|
const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
|
||||||
return vec2(vui & 0xF, vui >> 4) * d + m;
|
return vec2(vui & 0xF, vui >> 4) * d + m;
|
||||||
}
|
}
|
||||||
|
vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
|
||||||
|
const float d = float(data_a_packed16[a_offset + ib].d);
|
||||||
|
const float m = float(data_a_packed16[a_offset + ib].m);
|
||||||
|
const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]);
|
||||||
|
return vec4(vui & 0xF, (vui >> 4) & 0xF, (vui >> 8) & 0xF, (vui >> 12) & 0xF) * d + m;
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(DATA_A_Q5_0)
|
#if defined(DATA_A_Q5_0)
|
||||||
|
@ -39,6 +59,14 @@ vec2 dequantize(uint ib, uint iqs, uint a_offset) {
|
||||||
const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
|
const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
|
||||||
return (vec2((vui & 0xF) | qh.x, (vui >> 4) | qh.y) - 16.0f) * d;
|
return (vec2((vui & 0xF) | qh.x, (vui >> 4) | qh.y) - 16.0f) * d;
|
||||||
}
|
}
|
||||||
|
vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
|
||||||
|
const float d = float(data_a_packed16[a_offset + ib].d);
|
||||||
|
const uint uint_qh = uint(data_a_packed16[a_offset + ib].qh[1]) << 16 | data_a_packed16[a_offset + ib].qh[0];
|
||||||
|
const ivec2 qh0 = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10);
|
||||||
|
const ivec2 qh1 = ivec2(((uint_qh >> (iqs + 1)) << 4) & 0x10, (uint_qh >> (iqs + 13)) & 0x10);
|
||||||
|
const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]);
|
||||||
|
return (vec4(((vui >> 0) & 0xF) | qh0.x, ((vui >> 4) & 0xF) | qh0.y, ((vui >> 8) & 0xF) | qh1.x, ((vui >> 12) & 0xF) | qh1.y) - 16.0f) * d;
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(DATA_A_Q5_1)
|
#if defined(DATA_A_Q5_1)
|
||||||
|
@ -50,6 +78,15 @@ vec2 dequantize(uint ib, uint iqs, uint a_offset) {
|
||||||
const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
|
const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
|
||||||
return vec2((vui & 0xF) | qh.x, (vui >> 4) | qh.y) * d + m;
|
return vec2((vui & 0xF) | qh.x, (vui >> 4) | qh.y) * d + m;
|
||||||
}
|
}
|
||||||
|
vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
|
||||||
|
const float d = float(data_a_packed16[a_offset + ib].d);
|
||||||
|
const float m = float(data_a_packed16[a_offset + ib].m);
|
||||||
|
const uint uint_qh = data_a_packed16[a_offset + ib].qh;
|
||||||
|
const ivec2 qh0 = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10);
|
||||||
|
const ivec2 qh1 = ivec2(((uint_qh >> (iqs + 1)) << 4) & 0x10, (uint_qh >> (iqs + 13)) & 0x10);
|
||||||
|
const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]);
|
||||||
|
return vec4(((vui >> 0) & 0xF) | qh0.x, ((vui >> 4) & 0xF) | qh0.y, ((vui >> 8) & 0xF) | qh1.x, ((vui >> 12) & 0xF) | qh1.y) * d + m;
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(DATA_A_Q8_0)
|
#if defined(DATA_A_Q8_0)
|
||||||
|
@ -57,6 +94,12 @@ vec2 dequantize(uint ib, uint iqs, uint a_offset) {
|
||||||
const float d = float(data_a[a_offset + ib].d);
|
const float d = float(data_a[a_offset + ib].d);
|
||||||
return vec2(int(data_a[a_offset + ib].qs[iqs]), int(data_a[a_offset + ib].qs[iqs + 1])) * d;
|
return vec2(int(data_a[a_offset + ib].qs[iqs]), int(data_a[a_offset + ib].qs[iqs + 1])) * d;
|
||||||
}
|
}
|
||||||
|
vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
|
||||||
|
const float d = float(data_a_packed16[a_offset + ib].d);
|
||||||
|
uint32_t v0 = data_a_packed16[a_offset + ib].qs[iqs/2];
|
||||||
|
uint32_t v1 = data_a_packed16[a_offset + ib].qs[iqs/2 + 1];
|
||||||
|
return vec4(int8_t(v0 & 0xFF), int8_t((v0 >> 8) & 0xFF), int8_t(v1 & 0xFF), int8_t((v1 >> 8) & 0xFF)) * d;
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(DATA_A_IQ4_NL)
|
#if defined(DATA_A_IQ4_NL)
|
||||||
|
@ -65,4 +108,9 @@ vec2 dequantize(uint ib, uint iqs, uint a_offset) {
|
||||||
const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
|
const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
|
||||||
return vec2(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[vui >> 4]) * d;
|
return vec2(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[vui >> 4]) * d;
|
||||||
}
|
}
|
||||||
|
vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
|
||||||
|
const float d = float(data_a_packed16[a_offset + ib].d);
|
||||||
|
const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]);
|
||||||
|
return vec4(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[(vui >> 4) & 0xF], kvalues_iq4nl[(vui >> 8) & 0xF], kvalues_iq4nl[(vui >> 12) & 0xF]) * d;
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -10,6 +10,8 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
|
||||||
void main() {
|
void main() {
|
||||||
const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64;
|
const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64;
|
||||||
|
|
||||||
|
init_iq4nl_shmem();
|
||||||
|
|
||||||
const uint tid = gl_LocalInvocationID.x % 64;
|
const uint tid = gl_LocalInvocationID.x % 64;
|
||||||
const uint il = tid/32;
|
const uint il = tid/32;
|
||||||
const uint ir = tid%32;
|
const uint ir = tid%32;
|
||||||
|
|
|
@ -12,6 +12,10 @@ void main() {
|
||||||
const uint i11 = (gl_GlobalInvocationID.z)/p.ne12;
|
const uint i11 = (gl_GlobalInvocationID.z)/p.ne12;
|
||||||
const uint i12 = (gl_GlobalInvocationID.z)%p.ne12;
|
const uint i12 = (gl_GlobalInvocationID.z)%p.ne12;
|
||||||
|
|
||||||
|
#if defined(DATA_A_IQ4_NL)
|
||||||
|
init_iq4nl_shmem();
|
||||||
|
#endif
|
||||||
|
|
||||||
if (i00 >= p.ne00) {
|
if (i00 >= p.ne00) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
#ifdef FLOAT16
|
#ifdef FLOAT16
|
||||||
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
|
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
|
||||||
#endif
|
#endif
|
||||||
#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
|
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||||
|
|
||||||
#include "mul_mat_vec_base.comp"
|
#include "mul_mat_vec_base.comp"
|
||||||
|
|
||||||
|
@ -12,16 +12,48 @@ layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||||
layout (constant_id = 0) const uint BLOCK_SIZE = 32;
|
layout (constant_id = 0) const uint BLOCK_SIZE = 32;
|
||||||
layout (constant_id = 1) const uint NUM_ROWS = 1;
|
layout (constant_id = 1) const uint NUM_ROWS = 1;
|
||||||
|
|
||||||
|
#if !defined(DATA_A_F32) && !defined(DATA_A_F16)
|
||||||
|
#define K_PER_ITER 8
|
||||||
|
#else
|
||||||
|
#define K_PER_ITER 2
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
uint a_offset, b_offset, d_offset, y_offset;
|
uint a_offset, b_offset, d_offset, y_offset;
|
||||||
|
|
||||||
shared FLOAT_TYPE tmpsh[NUM_ROWS][BLOCK_SIZE];
|
shared FLOAT_TYPE tmpsh[NUM_ROWS][BLOCK_SIZE];
|
||||||
|
|
||||||
void iter(inout FLOAT_TYPE temp[NUM_ROWS], const uint first_row, const uint num_rows, const uint tid, const uint i, bool lastiter)
|
void iter(inout FLOAT_TYPE temp[NUM_ROWS], const uint first_row, const uint num_rows, const uint tid, const uint i, bool lastiter)
|
||||||
{
|
{
|
||||||
const uint col = i*BLOCK_SIZE + 2*tid;
|
const uint col = i*BLOCK_SIZE + K_PER_ITER*tid;
|
||||||
const uint iqs = (col%QUANT_K)/QUANT_R; // quant index
|
const uint iqs = (col%QUANT_K)/QUANT_R; // quant index
|
||||||
const uint iybs = col - col%QUANT_K; // y block start index
|
const uint iybs = col - col%QUANT_K; // y block start index
|
||||||
|
|
||||||
|
#if K_PER_ITER == 8
|
||||||
|
#if QUANT_R == 2
|
||||||
|
B_TYPE_VEC4 bv02 = data_b_v4[(b_offset + iybs + iqs) / 4];
|
||||||
|
B_TYPE_VEC4 bv13 = data_b_v4[(b_offset + iybs + iqs + y_offset) / 4];
|
||||||
|
FLOAT_TYPE b0 = FLOAT_TYPE(bv02.x);
|
||||||
|
FLOAT_TYPE b1 = FLOAT_TYPE(bv13.x);
|
||||||
|
FLOAT_TYPE b2 = FLOAT_TYPE(bv02.y);
|
||||||
|
FLOAT_TYPE b3 = FLOAT_TYPE(bv13.y);
|
||||||
|
FLOAT_TYPE b4 = FLOAT_TYPE(bv02.z);
|
||||||
|
FLOAT_TYPE b5 = FLOAT_TYPE(bv13.z);
|
||||||
|
FLOAT_TYPE b6 = FLOAT_TYPE(bv02.w);
|
||||||
|
FLOAT_TYPE b7 = FLOAT_TYPE(bv13.w);
|
||||||
|
#else
|
||||||
|
B_TYPE_VEC4 bv0 = data_b_v4[(b_offset + iybs + iqs) / 4];
|
||||||
|
B_TYPE_VEC4 bv1 = data_b_v4[(b_offset + iybs + iqs) / 4 + 1];
|
||||||
|
FLOAT_TYPE b0 = FLOAT_TYPE(bv0.x);
|
||||||
|
FLOAT_TYPE b1 = FLOAT_TYPE(bv0.y);
|
||||||
|
FLOAT_TYPE b2 = FLOAT_TYPE(bv0.z);
|
||||||
|
FLOAT_TYPE b3 = FLOAT_TYPE(bv0.w);
|
||||||
|
FLOAT_TYPE b4 = FLOAT_TYPE(bv1.x);
|
||||||
|
FLOAT_TYPE b5 = FLOAT_TYPE(bv1.y);
|
||||||
|
FLOAT_TYPE b6 = FLOAT_TYPE(bv1.z);
|
||||||
|
FLOAT_TYPE b7 = FLOAT_TYPE(bv1.w);
|
||||||
|
#endif
|
||||||
|
#else
|
||||||
// Check if the second of the pair of elements is OOB, and don't fetch B or
|
// Check if the second of the pair of elements is OOB, and don't fetch B or
|
||||||
// accumulate it. We still fetch a pair of elements for A, which is fine for
|
// accumulate it. We still fetch a pair of elements for A, which is fine for
|
||||||
// quantized formats since they'll be within the same block. We should
|
// quantized formats since they'll be within the same block. We should
|
||||||
|
@ -34,9 +66,24 @@ void iter(inout FLOAT_TYPE temp[NUM_ROWS], const uint first_row, const uint num_
|
||||||
if (!OOB) {
|
if (!OOB) {
|
||||||
b1 = FLOAT_TYPE(data_b[b_offset + iybs + iqs + y_offset]);
|
b1 = FLOAT_TYPE(data_b[b_offset + iybs + iqs + y_offset]);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||||
const uint ib = ((first_row + n)*p.ncols + col)/QUANT_K; // block index
|
const uint ib = ((first_row + n)*p.ncols + col)/QUANT_K; // block index
|
||||||
|
|
||||||
|
#if K_PER_ITER == 8
|
||||||
|
const vec4 v = dequantize4(ib, iqs, a_offset);
|
||||||
|
const vec4 v2 = dequantize4(ib, iqs+(4/QUANT_R), a_offset);
|
||||||
|
|
||||||
|
// matrix multiplication
|
||||||
|
temp[n] = fma(FLOAT_TYPE(v.x), b0, temp[n]);
|
||||||
|
temp[n] = fma(FLOAT_TYPE(v.y), b1, temp[n]);
|
||||||
|
temp[n] = fma(FLOAT_TYPE(v.z), b2, temp[n]);
|
||||||
|
temp[n] = fma(FLOAT_TYPE(v.w), b3, temp[n]);
|
||||||
|
temp[n] = fma(FLOAT_TYPE(v2.x), b4, temp[n]);
|
||||||
|
temp[n] = fma(FLOAT_TYPE(v2.y), b5, temp[n]);
|
||||||
|
temp[n] = fma(FLOAT_TYPE(v2.z), b6, temp[n]);
|
||||||
|
temp[n] = fma(FLOAT_TYPE(v2.w), b7, temp[n]);
|
||||||
|
#else
|
||||||
const vec2 v = dequantize(ib, iqs, a_offset);
|
const vec2 v = dequantize(ib, iqs, a_offset);
|
||||||
|
|
||||||
// matrix multiplication
|
// matrix multiplication
|
||||||
|
@ -44,6 +91,7 @@ void iter(inout FLOAT_TYPE temp[NUM_ROWS], const uint first_row, const uint num_
|
||||||
if (!OOB) {
|
if (!OOB) {
|
||||||
temp[n] = fma(FLOAT_TYPE(v.y), b1, temp[n]);
|
temp[n] = fma(FLOAT_TYPE(v.y), b1, temp[n]);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -61,22 +109,33 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
|
||||||
temp[i] = FLOAT_TYPE(0);
|
temp[i] = FLOAT_TYPE(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
const int unroll_count = 8;
|
uint num_iters = p.ncols / (K_PER_ITER * BLOCK_SIZE);
|
||||||
|
if (num_iters * K_PER_ITER * BLOCK_SIZE + K_PER_ITER*tid < p.ncols) {
|
||||||
const uint num_iters = (p.ncols >= 2*tid) ? ((p.ncols - 2*tid + BLOCK_SIZE - 1) / BLOCK_SIZE) : 0;
|
num_iters++;
|
||||||
const uint unrolled_iters = num_iters & ~(2*unroll_count - 1);
|
}
|
||||||
|
int unroll_count = 4;
|
||||||
|
uint unrolled_iters = num_iters & ~(unroll_count - 1);
|
||||||
|
|
||||||
uint i = 0;
|
uint i = 0;
|
||||||
while (i < unrolled_iters) {
|
while (i < unrolled_iters) {
|
||||||
// Manually partially unroll the loop
|
// Manually partially unroll the loop
|
||||||
[[unroll]] for (uint k = 0; k < unroll_count; ++k) {
|
[[unroll]] for (uint k = 0; k < unroll_count; ++k) {
|
||||||
iter(temp, first_row, num_rows, tid, i, false);
|
iter(temp, first_row, num_rows, tid, i*K_PER_ITER, false);
|
||||||
i += 2;
|
i++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
unroll_count = 2;
|
||||||
|
unrolled_iters = num_iters & ~(unroll_count - 1);
|
||||||
|
while (i < unrolled_iters) {
|
||||||
|
// Manually partially unroll the loop
|
||||||
|
[[unroll]] for (uint k = 0; k < unroll_count; ++k) {
|
||||||
|
iter(temp, first_row, num_rows, tid, i*K_PER_ITER, false);
|
||||||
|
i++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
while (i < num_iters) {
|
while (i < num_iters) {
|
||||||
iter(temp, first_row, num_rows, tid, i, true);
|
iter(temp, first_row, num_rows, tid, i*K_PER_ITER, true);
|
||||||
i += 2;
|
i++;
|
||||||
}
|
}
|
||||||
|
|
||||||
// sum up partial sums and write back result
|
// sum up partial sums and write back result
|
||||||
|
@ -102,10 +161,17 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
|
||||||
void main() {
|
void main() {
|
||||||
const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
|
const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
|
||||||
|
|
||||||
|
#if defined(DATA_A_IQ4_NL)
|
||||||
|
init_iq4nl_shmem();
|
||||||
|
#endif
|
||||||
|
|
||||||
// do NUM_ROWS at a time, unless there aren't enough remaining rows
|
// do NUM_ROWS at a time, unless there aren't enough remaining rows
|
||||||
if (first_row + NUM_ROWS <= p.stride_d) {
|
if (first_row + NUM_ROWS <= p.stride_d) {
|
||||||
compute_outputs(first_row, NUM_ROWS);
|
compute_outputs(first_row, NUM_ROWS);
|
||||||
} else {
|
} else {
|
||||||
|
if (first_row >= p.stride_d) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
compute_outputs(first_row, p.stride_d - first_row);
|
compute_outputs(first_row, p.stride_d - first_row);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -12,6 +12,9 @@
|
||||||
|
|
||||||
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
|
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
|
||||||
layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
|
layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
|
||||||
|
layout (binding = 1) readonly buffer BV2 {B_TYPE_VEC2 data_b_v2[];};
|
||||||
|
layout (binding = 1) readonly buffer BV4 {B_TYPE_VEC4 data_b_v4[];};
|
||||||
|
|
||||||
layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
|
layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
|
||||||
#ifdef MUL_MAT_ID
|
#ifdef MUL_MAT_ID
|
||||||
layout (binding = 3) readonly buffer IDS {int data_ids[];};
|
layout (binding = 3) readonly buffer IDS {int data_ids[];};
|
||||||
|
|
|
@ -9,6 +9,10 @@ shared FLOAT_TYPE tmp[32];
|
||||||
void main() {
|
void main() {
|
||||||
const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
|
const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
|
||||||
|
|
||||||
|
if (row >= p.stride_d) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
uint a_offset, b_offset, d_offset;
|
uint a_offset, b_offset, d_offset;
|
||||||
get_offsets(a_offset, b_offset, d_offset);
|
get_offsets(a_offset, b_offset, d_offset);
|
||||||
|
|
||||||
|
|
|
@ -9,6 +9,10 @@ shared FLOAT_TYPE tmp[32];
|
||||||
void main() {
|
void main() {
|
||||||
const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
|
const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
|
||||||
|
|
||||||
|
if (row >= p.stride_d) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
uint a_offset, b_offset, d_offset;
|
uint a_offset, b_offset, d_offset;
|
||||||
get_offsets(a_offset, b_offset, d_offset);
|
get_offsets(a_offset, b_offset, d_offset);
|
||||||
|
|
||||||
|
|
|
@ -8,30 +8,14 @@ layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
|
||||||
|
|
||||||
shared FLOAT_TYPE tmp[32];
|
shared FLOAT_TYPE tmp[32];
|
||||||
|
|
||||||
// Declare aliased versions of A and B bindings that can use 16b/32b loads for
|
|
||||||
// the quantized values, and vec4 loads for B.
|
|
||||||
struct block_q4_K_u32
|
|
||||||
{
|
|
||||||
f16vec2 d;
|
|
||||||
uint32_t scales[3*QUANT_K/64/4];
|
|
||||||
uint32_t qs[QUANT_K/2/4];
|
|
||||||
};
|
|
||||||
|
|
||||||
struct block_q4_K_u16
|
|
||||||
{
|
|
||||||
f16vec2 d;
|
|
||||||
uint16_t scales[3*QUANT_K/64/2];
|
|
||||||
uint16_t qs[QUANT_K/2/2];
|
|
||||||
};
|
|
||||||
|
|
||||||
layout (binding = 0) readonly buffer A_u32 {block_q4_K_u32 data_a_u32[];};
|
|
||||||
layout (binding = 0) readonly buffer A_u16 {block_q4_K_u16 data_a_u16[];};
|
|
||||||
layout (binding = 1) readonly buffer BV4 {B_TYPE_VEC4 data_b_v4[];};
|
|
||||||
|
|
||||||
// This shader assumes K_QUANTS_PER_ITERATION == 2 for alignment of loads
|
// This shader assumes K_QUANTS_PER_ITERATION == 2 for alignment of loads
|
||||||
void main() {
|
void main() {
|
||||||
const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
|
const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
|
||||||
|
|
||||||
|
if (row >= p.stride_d) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
uint a_offset, b_offset, d_offset;
|
uint a_offset, b_offset, d_offset;
|
||||||
get_offsets(a_offset, b_offset, d_offset);
|
get_offsets(a_offset, b_offset, d_offset);
|
||||||
|
|
||||||
|
@ -64,9 +48,9 @@ void main() {
|
||||||
const FLOAT_TYPE dall = FLOAT_TYPE(d.x);
|
const FLOAT_TYPE dall = FLOAT_TYPE(d.x);
|
||||||
const FLOAT_TYPE dmin = FLOAT_TYPE(d.y);
|
const FLOAT_TYPE dmin = FLOAT_TYPE(d.y);
|
||||||
|
|
||||||
uint32_t scale0_u32 = data_a_u16[ib0 + i].scales[v_im ];
|
uint32_t scale0_u32 = data_a_packed16[ib0 + i].scales[v_im ];
|
||||||
uint32_t scale4_u32 = data_a_u16[ib0 + i].scales[v_im + 2];
|
uint32_t scale4_u32 = data_a_packed16[ib0 + i].scales[v_im + 2];
|
||||||
uint32_t scale8_u32 = data_a_u16[ib0 + i].scales[v_im + 4];
|
uint32_t scale8_u32 = data_a_packed16[ib0 + i].scales[v_im + 4];
|
||||||
uvec4 scale0 = uvec4(unpack8(scale0_u32));
|
uvec4 scale0 = uvec4(unpack8(scale0_u32));
|
||||||
uvec4 scale4 = uvec4(unpack8(scale4_u32));
|
uvec4 scale4 = uvec4(unpack8(scale4_u32));
|
||||||
uvec4 scale8 = uvec4(unpack8(scale8_u32));
|
uvec4 scale8 = uvec4(unpack8(scale8_u32));
|
||||||
|
@ -80,8 +64,8 @@ void main() {
|
||||||
const uint32_t sc6 = (((scale8.x >> 4) & 0x0f) | ((scale4.x & 0xc0) >> 2));
|
const uint32_t sc6 = (((scale8.x >> 4) & 0x0f) | ((scale4.x & 0xc0) >> 2));
|
||||||
const uint32_t sc7 = (((scale8.y >> 4) & 0x0f) | ((scale4.y & 0xc0) >> 2));
|
const uint32_t sc7 = (((scale8.y >> 4) & 0x0f) | ((scale4.y & 0xc0) >> 2));
|
||||||
|
|
||||||
uint32_t qs0_u32 = data_a_u32[ib0 + i].qs[q_offset / 4];
|
uint32_t qs0_u32 = data_a_packed32[ib0 + i].qs[q_offset / 4];
|
||||||
uint32_t qs64_u32 = data_a_u32[ib0 + i].qs[q_offset / 4 + 16];
|
uint32_t qs64_u32 = data_a_packed32[ib0 + i].qs[q_offset / 4 + 16];
|
||||||
|
|
||||||
uint32_t qs0_u32_lo4 = qs0_u32 & 0x0F0F0F0F;
|
uint32_t qs0_u32_lo4 = qs0_u32 & 0x0F0F0F0F;
|
||||||
uint32_t qs0_u32_hi4 = (qs0_u32 >> 4) & 0x0F0F0F0F;
|
uint32_t qs0_u32_hi4 = (qs0_u32 >> 4) & 0x0F0F0F0F;
|
||||||
|
|
|
@ -1,5 +1,7 @@
|
||||||
#version 450
|
#version 450
|
||||||
|
|
||||||
|
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||||
|
|
||||||
#include "mul_mat_vec_base.comp"
|
#include "mul_mat_vec_base.comp"
|
||||||
|
|
||||||
layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
|
layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
|
||||||
|
@ -9,6 +11,10 @@ shared FLOAT_TYPE tmp[32];
|
||||||
void main() {
|
void main() {
|
||||||
const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
|
const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
|
||||||
|
|
||||||
|
if (row >= p.stride_d) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
uint a_offset, b_offset, d_offset;
|
uint a_offset, b_offset, d_offset;
|
||||||
get_offsets(a_offset, b_offset, d_offset);
|
get_offsets(a_offset, b_offset, d_offset);
|
||||||
|
|
||||||
|
@ -31,70 +37,106 @@ void main() {
|
||||||
const uint8_t hm1 = uint8_t(1 << (2*v_im));
|
const uint8_t hm1 = uint8_t(1 << (2*v_im));
|
||||||
const uint8_t hm2 = uint8_t(hm1 << 4);
|
const uint8_t hm2 = uint8_t(hm1 << 4);
|
||||||
|
|
||||||
tmp[16 * ix + tid] = FLOAT_TYPE(0.0); // partial sum for thread in warp
|
FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp
|
||||||
|
|
||||||
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += 2) {
|
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += 2) {
|
||||||
const uint y1_idx = i * QUANT_K + y_offset;
|
const uint y1_idx = i * QUANT_K + y_offset;
|
||||||
const uint y2_idx = y1_idx + 128;
|
const uint y2_idx = y1_idx + 128;
|
||||||
|
|
||||||
const FLOAT_TYPE dall = FLOAT_TYPE(data_a[ib0 + i].d.x);
|
f16vec2 d = data_a[ib0 + i].d;
|
||||||
const FLOAT_TYPE dmin = FLOAT_TYPE(data_a[ib0 + i].d.y);
|
const FLOAT_TYPE dall = FLOAT_TYPE(d.x);
|
||||||
|
const FLOAT_TYPE dmin = FLOAT_TYPE(d.y);
|
||||||
|
|
||||||
const uint8_t sc0 = uint8_t( data_a[ib0 + i].scales[v_im * 2 ] & 0x3f);
|
uint32_t scale0_u32 = data_a_packed16[ib0 + i].scales[v_im ];
|
||||||
const uint8_t sc1 = uint8_t( data_a[ib0 + i].scales[v_im * 2 + 1] & 0x3f);
|
uint32_t scale4_u32 = data_a_packed16[ib0 + i].scales[v_im + 2];
|
||||||
const uint8_t sc2 = uint8_t( data_a[ib0 + i].scales[v_im * 2 + 4] & 0x3f);
|
uint32_t scale8_u32 = data_a_packed16[ib0 + i].scales[v_im + 4];
|
||||||
const uint8_t sc3 = uint8_t( data_a[ib0 + i].scales[v_im * 2 + 5] & 0x3f);
|
uvec4 scale0 = uvec4(unpack8(scale0_u32));
|
||||||
const uint8_t sc4 = uint8_t(( data_a[ib0 + i].scales[v_im * 2 + 8] & 0x0f) | ((data_a[ib0 + i].scales[v_im * 2 ] & 0xc0) >> 2));
|
uvec4 scale4 = uvec4(unpack8(scale4_u32));
|
||||||
const uint8_t sc5 = uint8_t(( data_a[ib0 + i].scales[v_im * 2 + 9] & 0x0f) | ((data_a[ib0 + i].scales[v_im * 2 + 1] & 0xc0) >> 2));
|
uvec4 scale8 = uvec4(unpack8(scale8_u32));
|
||||||
const uint8_t sc6 = uint8_t(((data_a[ib0 + i].scales[v_im * 2 + 8] >> 4) & 0x0f) | ((data_a[ib0 + i].scales[v_im * 2 + 4] & 0xc0) >> 2));
|
|
||||||
const uint8_t sc7 = uint8_t(((data_a[ib0 + i].scales[v_im * 2 + 9] >> 4) & 0x0f) | ((data_a[ib0 + i].scales[v_im * 2 + 5] & 0xc0) >> 2));
|
|
||||||
|
|
||||||
const uint8_t q4_0 = uint8_t(data_a[ib0 + i].qs[q_offset ] & 0xf);
|
const uint32_t sc0 = ( scale0.x & 0x3f);
|
||||||
const uint8_t q4_1 = uint8_t(data_a[ib0 + i].qs[q_offset + 1] & 0xf);
|
const uint32_t sc1 = ( scale0.y & 0x3f);
|
||||||
const uint8_t q4_2 = uint8_t(data_a[ib0 + i].qs[q_offset + 16] & 0xf);
|
const uint32_t sc2 = ( scale4.x & 0x3f);
|
||||||
const uint8_t q4_3 = uint8_t(data_a[ib0 + i].qs[q_offset + 17] & 0xf);
|
const uint32_t sc3 = ( scale4.y & 0x3f);
|
||||||
const uint8_t q4_4 = uint8_t(data_a[ib0 + i].qs[q_offset ] >> 4);
|
const uint32_t sc4 = (( scale8.x & 0x0f) | ((scale0.x & 0xc0) >> 2));
|
||||||
const uint8_t q4_5 = uint8_t(data_a[ib0 + i].qs[q_offset + 1] >> 4);
|
const uint32_t sc5 = (( scale8.y & 0x0f) | ((scale0.y & 0xc0) >> 2));
|
||||||
const uint8_t q4_6 = uint8_t(data_a[ib0 + i].qs[q_offset + 16] >> 4);
|
const uint32_t sc6 = (((scale8.x >> 4) & 0x0f) | ((scale4.x & 0xc0) >> 2));
|
||||||
const uint8_t q4_7 = uint8_t(data_a[ib0 + i].qs[q_offset + 17] >> 4);
|
const uint32_t sc7 = (((scale8.y >> 4) & 0x0f) | ((scale4.y & 0xc0) >> 2));
|
||||||
const uint8_t q4_8 = uint8_t(data_a[ib0 + i].qs[q_offset + 64] & 0xf);
|
|
||||||
const uint8_t q4_9 = uint8_t(data_a[ib0 + i].qs[q_offset + 65] & 0xf);
|
uint32_t qs0_16_u32 = uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 8]) << 16);
|
||||||
const uint8_t q4_10 = uint8_t(data_a[ib0 + i].qs[q_offset + 80] & 0xf);
|
uint32_t qs64_80_u32 = uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 32]) | (uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 40]) << 16);
|
||||||
const uint8_t q4_11 = uint8_t(data_a[ib0 + i].qs[q_offset + 81] & 0xf);
|
|
||||||
const uint8_t q4_12 = uint8_t(data_a[ib0 + i].qs[q_offset + 64] >> 4);
|
uint32_t qs0_16_u32_lo4 = qs0_16_u32 & 0x0F0F0F0F;
|
||||||
const uint8_t q4_13 = uint8_t(data_a[ib0 + i].qs[q_offset + 65] >> 4);
|
uint32_t qs0_16_u32_hi4 = (qs0_16_u32 >> 4) & 0x0F0F0F0F;
|
||||||
const uint8_t q4_14 = uint8_t(data_a[ib0 + i].qs[q_offset + 80] >> 4);
|
uint32_t qs64_80_u32_lo4 = qs64_80_u32 & 0x0F0F0F0F;
|
||||||
const uint8_t q4_15 = uint8_t(data_a[ib0 + i].qs[q_offset + 81] >> 4);
|
uint32_t qs64_80_u32_hi4 = (qs64_80_u32 >> 4) & 0x0F0F0F0F;
|
||||||
|
|
||||||
|
uvec4 qs0_16_lo4 = uvec4(unpack8(qs0_16_u32_lo4));
|
||||||
|
uvec4 qs64_80_lo4 = uvec4(unpack8(qs64_80_u32_lo4));
|
||||||
|
uvec4 qs0_16_hi4 = uvec4(unpack8(qs0_16_u32_hi4));
|
||||||
|
uvec4 qs64_80_hi4 = uvec4(unpack8(qs64_80_u32_hi4));
|
||||||
|
|
||||||
|
const uint32_t q4_0 = qs0_16_lo4.x;
|
||||||
|
const uint32_t q4_1 = qs0_16_lo4.y;
|
||||||
|
const uint32_t q4_2 = qs0_16_lo4.z;
|
||||||
|
const uint32_t q4_3 = qs0_16_lo4.w;
|
||||||
|
const uint32_t q4_4 = qs0_16_hi4.x;
|
||||||
|
const uint32_t q4_5 = qs0_16_hi4.y;
|
||||||
|
const uint32_t q4_6 = qs0_16_hi4.z;
|
||||||
|
const uint32_t q4_7 = qs0_16_hi4.w;
|
||||||
|
const uint32_t q4_8 = qs64_80_lo4.x;
|
||||||
|
const uint32_t q4_9 = qs64_80_lo4.y;
|
||||||
|
const uint32_t q4_10 = qs64_80_lo4.z;
|
||||||
|
const uint32_t q4_11 = qs64_80_lo4.w;
|
||||||
|
const uint32_t q4_12 = qs64_80_hi4.x;
|
||||||
|
const uint32_t q4_13 = qs64_80_hi4.y;
|
||||||
|
const uint32_t q4_14 = qs64_80_hi4.z;
|
||||||
|
const uint32_t q4_15 = qs64_80_hi4.w;
|
||||||
|
|
||||||
|
B_TYPE_VEC2 by10 = data_b_v2[(b_offset + y1_idx) / 2];
|
||||||
|
B_TYPE_VEC2 by116 = data_b_v2[(b_offset + y1_idx) / 2 + 8];
|
||||||
|
B_TYPE_VEC2 by132 = data_b_v2[(b_offset + y1_idx) / 2 + 16];
|
||||||
|
B_TYPE_VEC2 by148 = data_b_v2[(b_offset + y1_idx) / 2 + 24];
|
||||||
|
B_TYPE_VEC2 by20 = data_b_v2[(b_offset + y2_idx) / 2];
|
||||||
|
B_TYPE_VEC2 by216 = data_b_v2[(b_offset + y2_idx) / 2 + 8];
|
||||||
|
B_TYPE_VEC2 by232 = data_b_v2[(b_offset + y2_idx) / 2 + 16];
|
||||||
|
B_TYPE_VEC2 by248 = data_b_v2[(b_offset + y2_idx) / 2 + 24];
|
||||||
|
|
||||||
|
uint32_t qh0 = data_a_packed16[ib0 + i].qh[l0 / 2];
|
||||||
|
uint32_t qh1 = qh0 >> 8;
|
||||||
|
uint32_t qh16 = data_a_packed16[ib0 + i].qh[l0 / 2 + 8];
|
||||||
|
uint32_t qh17 = qh16 >> 8;
|
||||||
|
|
||||||
const FLOAT_TYPE sx =
|
const FLOAT_TYPE sx =
|
||||||
fma(FLOAT_TYPE(data_b[b_offset + y1_idx ]), (q4_0 + (((data_a[ib0 + i].qh[l0 ] & hm1) != 0) ? 16 : 0)),
|
fma(FLOAT_TYPE(by10.x), (q4_0 + (((qh0 & hm1) != 0) ? 16 : 0)),
|
||||||
fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 1]), (q4_1 + (((data_a[ib0 + i].qh[l0 + 1] & hm1) != 0) ? 16 : 0)),
|
fma(FLOAT_TYPE(by10.y), (q4_1 + (((qh1 & hm1) != 0) ? 16 : 0)),
|
||||||
fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 16]), (q4_2 + (((data_a[ib0 + i].qh[l0 + 16] & hm1) != 0) ? 16 : 0)),
|
fma(FLOAT_TYPE(by116.x), (q4_2 + (((qh16 & hm1) != 0) ? 16 : 0)),
|
||||||
FLOAT_TYPE(data_b[b_offset + y1_idx + 17]) * (q4_3 + (((data_a[ib0 + i].qh[l0 + 17] & hm1) != 0) ? 16 : 0)))));
|
FLOAT_TYPE(by116.y) * (q4_3 + (((qh17 & hm1) != 0) ? 16 : 0)))));
|
||||||
const FLOAT_TYPE sy =
|
const FLOAT_TYPE sy =
|
||||||
fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 32]), (q4_4 + (((data_a[ib0 + i].qh[l0 ] & (hm1 << 1)) != 0) ? 16 : 0)),
|
fma(FLOAT_TYPE(by132.x), (q4_4 + (((qh0 & (hm1 << 1)) != 0) ? 16 : 0)),
|
||||||
fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 33]), (q4_5 + (((data_a[ib0 + i].qh[l0 + 1] & (hm1 << 1)) != 0) ? 16 : 0)),
|
fma(FLOAT_TYPE(by132.y), (q4_5 + (((qh1 & (hm1 << 1)) != 0) ? 16 : 0)),
|
||||||
fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 48]), (q4_6 + (((data_a[ib0 + i].qh[l0 + 16] & (hm1 << 1)) != 0) ? 16 : 0)),
|
fma(FLOAT_TYPE(by148.x), (q4_6 + (((qh16 & (hm1 << 1)) != 0) ? 16 : 0)),
|
||||||
FLOAT_TYPE(data_b[b_offset + y1_idx + 49]) * (q4_7 + (((data_a[ib0 + i].qh[l0 + 17] & (hm1 << 1)) != 0) ? 16 : 0)))));
|
FLOAT_TYPE(by148.y) * (q4_7 + (((qh17 & (hm1 << 1)) != 0) ? 16 : 0)))));
|
||||||
const FLOAT_TYPE sz =
|
const FLOAT_TYPE sz =
|
||||||
fma(FLOAT_TYPE(data_b[b_offset + y2_idx ]), (q4_8 + (((data_a[ib0 + i].qh[l0 ] & hm2) != 0) ? 16 : 0)),
|
fma(FLOAT_TYPE(by20.x), (q4_8 + (((qh0 & hm2) != 0) ? 16 : 0)),
|
||||||
fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 1]), (q4_9 + (((data_a[ib0 + i].qh[l0 + 1] & hm2) != 0) ? 16 : 0)),
|
fma(FLOAT_TYPE(by20.y), (q4_9 + (((qh1 & hm2) != 0) ? 16 : 0)),
|
||||||
fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 16]), (q4_10 + (((data_a[ib0 + i].qh[l0 + 16] & hm2) != 0) ? 16 : 0)),
|
fma(FLOAT_TYPE(by216.x), (q4_10 + (((qh16 & hm2) != 0) ? 16 : 0)),
|
||||||
FLOAT_TYPE(data_b[b_offset + y2_idx + 17]) * (q4_11 + (((data_a[ib0 + i].qh[l0 + 17] & hm2) != 0) ? 16 : 0)))));
|
FLOAT_TYPE(by216.y) * (q4_11 + (((qh17 & hm2) != 0) ? 16 : 0)))));
|
||||||
const FLOAT_TYPE sw =
|
const FLOAT_TYPE sw =
|
||||||
fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 32]), (q4_12 + (((data_a[ib0 + i].qh[l0 ] & (hm2 << 1)) != 0) ? 16 : 0)),
|
fma(FLOAT_TYPE(by232.x), (q4_12 + (((qh0 & (hm2 << 1)) != 0) ? 16 : 0)),
|
||||||
fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 33]), (q4_13 + (((data_a[ib0 + i].qh[l0 + 1] & (hm2 << 1)) != 0) ? 16 : 0)),
|
fma(FLOAT_TYPE(by232.y), (q4_13 + (((qh1 & (hm2 << 1)) != 0) ? 16 : 0)),
|
||||||
fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 48]), (q4_14 + (((data_a[ib0 + i].qh[l0 + 16] & (hm2 << 1)) != 0) ? 16 : 0)),
|
fma(FLOAT_TYPE(by248.x), (q4_14 + (((qh16 & (hm2 << 1)) != 0) ? 16 : 0)),
|
||||||
FLOAT_TYPE(data_b[b_offset + y2_idx + 49]) * (q4_15 + (((data_a[ib0 + i].qh[l0 + 17] & (hm2 << 1)) != 0) ? 16 : 0)))));
|
FLOAT_TYPE(by248.y) * (q4_15 + (((qh17 & (hm2 << 1)) != 0) ? 16 : 0)))));
|
||||||
const FLOAT_TYPE smin =
|
const FLOAT_TYPE smin =
|
||||||
fma(FLOAT_TYPE(data_b[b_offset + y1_idx ]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 1 ]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 16]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 17]), sc2,
|
fma(FLOAT_TYPE(by10.x) + FLOAT_TYPE(by10.y) + FLOAT_TYPE(by116.x) + FLOAT_TYPE(by116.y), sc2,
|
||||||
fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 32]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 33]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 48]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 49]), sc3,
|
fma(FLOAT_TYPE(by132.x) + FLOAT_TYPE(by132.y) + FLOAT_TYPE(by148.x) + FLOAT_TYPE(by148.y), sc3,
|
||||||
fma(FLOAT_TYPE(data_b[b_offset + y2_idx ]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 1 ]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 16]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 17]), sc6,
|
fma(FLOAT_TYPE(by20.x) + FLOAT_TYPE(by20.y) + FLOAT_TYPE(by216.x) + FLOAT_TYPE(by216.y), sc6,
|
||||||
(FLOAT_TYPE(data_b[b_offset + y2_idx + 32]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 33]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 48]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 49])) * sc7)));
|
(FLOAT_TYPE(by232.x) + FLOAT_TYPE(by232.y) + FLOAT_TYPE(by248.x) + FLOAT_TYPE(by248.y)) * sc7)));
|
||||||
const uint tmp_idx = 16 * ix + tid;
|
temp = fma(dall, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dmin, smin, temp));
|
||||||
tmp[tmp_idx] = fma(dall, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dmin, smin, tmp[tmp_idx]));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
tmp[gl_LocalInvocationID.x] = temp;
|
||||||
|
|
||||||
// sum up partial sums and write back result
|
// sum up partial sums and write back result
|
||||||
barrier();
|
barrier();
|
||||||
[[unroll]] for (uint s = 16; s > 0; s >>= 1) {
|
[[unroll]] for (uint s = 16; s > 0; s >>= 1) {
|
||||||
|
|
|
@ -1,5 +1,7 @@
|
||||||
#version 450
|
#version 450
|
||||||
|
|
||||||
|
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||||
|
|
||||||
#include "mul_mat_vec_base.comp"
|
#include "mul_mat_vec_base.comp"
|
||||||
|
|
||||||
layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
|
layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
|
||||||
|
@ -9,6 +11,10 @@ shared FLOAT_TYPE tmp[32];
|
||||||
void main() {
|
void main() {
|
||||||
const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
|
const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
|
||||||
|
|
||||||
|
if (row >= p.stride_d) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
uint a_offset, b_offset, d_offset;
|
uint a_offset, b_offset, d_offset;
|
||||||
get_offsets(a_offset, b_offset, d_offset);
|
get_offsets(a_offset, b_offset, d_offset);
|
||||||
|
|
||||||
|
@ -36,35 +42,60 @@ void main() {
|
||||||
const uint s_offset = 8*v_im + is;
|
const uint s_offset = 8*v_im + is;
|
||||||
const uint y_offset = 128*v_im + l0;
|
const uint y_offset = 128*v_im + l0;
|
||||||
|
|
||||||
tmp[16 * ix + tid] = FLOAT_TYPE(0.0); // partial sum for thread in warp
|
FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp
|
||||||
|
|
||||||
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
|
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
|
||||||
const uint y_idx = i * QUANT_K + y_offset;
|
const uint y_idx = i * QUANT_K + y_offset;
|
||||||
|
|
||||||
const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d);
|
const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d);
|
||||||
|
|
||||||
#if K_QUANTS_PER_ITERATION == 1
|
FLOAT_TYPE scales[4];
|
||||||
const uint tmp_idx = 16 * ix + tid;
|
scales[0] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0]);
|
||||||
tmp[tmp_idx] = fma(FLOAT_TYPE(data_b[b_offset + y_idx + 0]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 0] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 0] & 0x03) << 4)) - 32),
|
scales[1] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2]);
|
||||||
fma(FLOAT_TYPE(data_b[b_offset + y_idx + 16]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 1]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 16] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0x03) << 4)) - 32),
|
scales[2] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4]);
|
||||||
fma(FLOAT_TYPE(data_b[b_offset + y_idx + 32]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 32] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 0] & 0x0c) << 2)) - 32),
|
scales[3] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6]);
|
||||||
fma(FLOAT_TYPE(data_b[b_offset + y_idx + 48]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 3]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 48] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0x0c) << 2)) - 32),
|
|
||||||
fma(FLOAT_TYPE(data_b[b_offset + y_idx + 64]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 0] >> 4) | ((data_a[ib0 + i].qh[qh_offset + 0] & 0x30) >> 0)) - 32),
|
uint32_t ql0_u32 = uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 1]) << 16);
|
||||||
fma(FLOAT_TYPE(data_b[b_offset + y_idx + 80]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 5]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 16] >> 4) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0x30) >> 0)) - 32),
|
uint32_t ql32_u32 = uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 16]) | (uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 17]) << 16);
|
||||||
fma(FLOAT_TYPE(data_b[b_offset + y_idx + 96]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 32] >> 4) | ((data_a[ib0 + i].qh[qh_offset + 0] & 0xc0) >> 2)) - 32),
|
|
||||||
fma(FLOAT_TYPE(data_b[b_offset + y_idx +112]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 7]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 48] >> 4) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0xc0) >> 2)) - 32), tmp[tmp_idx]))))))));
|
uint32_t ql0_u32_lo4 = ql0_u32 & 0x0F0F0F0F;
|
||||||
#else
|
uint32_t ql0_u32_hi4 = (ql0_u32 >> 4) & 0x0F0F0F0F;
|
||||||
|
uint32_t ql32_u32_lo4 = ql32_u32 & 0x0F0F0F0F;
|
||||||
|
uint32_t ql32_u32_hi4 = (ql32_u32 >> 4) & 0x0F0F0F0F;
|
||||||
|
|
||||||
|
uint32_t qh_u32 = uint32_t(data_a_packed16[ib0 + i].qh[qh_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].qh[qh_offset / 2 + 1]) << 16);
|
||||||
|
uint32_t qh0_u32 = (qh_u32 & 0x03030303) << 4;
|
||||||
|
uint32_t qh2_u32 = (qh_u32 & 0x0C0C0C0C) << 2;
|
||||||
|
uint32_t qh4_u32 = (qh_u32 & 0x30303030) << 0;
|
||||||
|
uint32_t qh6_u32 = (qh_u32 & 0xC0C0C0C0) >> 2;
|
||||||
|
|
||||||
|
uint32_t q0_u32 = ql0_u32_lo4 | qh0_u32;
|
||||||
|
uint32_t q1_u32 = ql32_u32_lo4 | qh2_u32;
|
||||||
|
uint32_t q2_u32 = ql0_u32_hi4 | qh4_u32;
|
||||||
|
uint32_t q3_u32 = ql32_u32_hi4 | qh6_u32;
|
||||||
|
|
||||||
|
uvec4 q0 = uvec4(unpack8(q0_u32));
|
||||||
|
uvec4 q1 = uvec4(unpack8(q1_u32));
|
||||||
|
uvec4 q2 = uvec4(unpack8(q2_u32));
|
||||||
|
uvec4 q3 = uvec4(unpack8(q3_u32));
|
||||||
|
|
||||||
|
B_TYPE_VEC4 by0 = data_b_v4[(b_offset + y_idx) / 4];
|
||||||
|
B_TYPE_VEC4 by32 = data_b_v4[(b_offset + y_idx) / 4 + 8];
|
||||||
|
B_TYPE_VEC4 by64 = data_b_v4[(b_offset + y_idx) / 4 + 16];
|
||||||
|
B_TYPE_VEC4 by96 = data_b_v4[(b_offset + y_idx) / 4 + 24];
|
||||||
|
|
||||||
FLOAT_TYPE sum = FLOAT_TYPE(0.0);
|
FLOAT_TYPE sum = FLOAT_TYPE(0.0);
|
||||||
[[unroll]] for (int l = 0; l < 4; ++l) {
|
[[unroll]] for (int l = 0; l < 4; ++l) {
|
||||||
sum = fma(FLOAT_TYPE(data_b[b_offset + y_idx + l+ 0]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+ 0] & 0xF) | (((data_a[ib0 + i].qh[qh_offset + l] >> 0) & 3) << 4)) - 32),
|
sum = fma(FLOAT_TYPE(by0[l]) * scales[0], FLOAT_TYPE(int8_t(q0[l]) - 32),
|
||||||
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l+32]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+32] & 0xF) | (((data_a[ib0 + i].qh[qh_offset + l] >> 2) & 3) << 4)) - 32),
|
fma(FLOAT_TYPE(by32[l]) * scales[1], FLOAT_TYPE(int8_t(q1[l]) - 32),
|
||||||
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l+64]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+ 0] >> 4) | (((data_a[ib0 + i].qh[qh_offset + l] >> 4) & 3) << 4)) - 32),
|
fma(FLOAT_TYPE(by64[l]) * scales[2], FLOAT_TYPE(int8_t(q2[l]) - 32),
|
||||||
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l+96]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+32] >> 4) | (((data_a[ib0 + i].qh[qh_offset + l] >> 6) & 3) << 4)) - 32), sum))));
|
fma(FLOAT_TYPE(by96[l]) * scales[3], FLOAT_TYPE(int8_t(q3[l]) - 32), sum))));
|
||||||
}
|
}
|
||||||
tmp[16 * ix + tid] += sum;
|
temp += sum * d;
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
tmp[gl_LocalInvocationID.x] = temp;
|
||||||
|
|
||||||
// sum up partial sums and write back result
|
// sum up partial sums and write back result
|
||||||
barrier();
|
barrier();
|
||||||
[[unroll]] for (uint s = 16; s > 0; s >>= 1) {
|
[[unroll]] for (uint s = 16; s > 0; s >>= 1) {
|
||||||
|
|
|
@ -75,6 +75,10 @@ shared u16vec2 row_ids[3072];
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
void main() {
|
void main() {
|
||||||
|
#if defined(DATA_A_IQ4_NL)
|
||||||
|
init_iq4nl_shmem();
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef MUL_MAT_ID
|
#ifdef MUL_MAT_ID
|
||||||
const uint expert_idx = gl_GlobalInvocationID.z;
|
const uint expert_idx = gl_GlobalInvocationID.z;
|
||||||
#else
|
#else
|
||||||
|
|
|
@ -1,6 +1,8 @@
|
||||||
#if !defined(DATA_A_F32) && !defined(DATA_A_F16)
|
|
||||||
#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
|
#if !defined(GGML_TYPES_COMP)
|
||||||
#endif
|
#define GGML_TYPES_COMP
|
||||||
|
|
||||||
|
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||||
|
|
||||||
#if defined(DATA_A_F32)
|
#if defined(DATA_A_F32)
|
||||||
#define QUANT_K 1
|
#define QUANT_K 1
|
||||||
|
@ -38,8 +40,14 @@ struct block_q4_0
|
||||||
float16_t d;
|
float16_t d;
|
||||||
uint8_t qs[16];
|
uint8_t qs[16];
|
||||||
};
|
};
|
||||||
|
struct block_q4_0_packed16
|
||||||
|
{
|
||||||
|
float16_t d;
|
||||||
|
uint16_t qs[16/2];
|
||||||
|
};
|
||||||
|
|
||||||
#define A_TYPE block_q4_0
|
#define A_TYPE block_q4_0
|
||||||
|
#define A_TYPE_PACKED16 block_q4_0_packed16
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(DATA_A_Q4_1)
|
#if defined(DATA_A_Q4_1)
|
||||||
|
@ -54,7 +62,15 @@ struct block_q4_1
|
||||||
uint8_t qs[16];
|
uint8_t qs[16];
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct block_q4_1_packed16
|
||||||
|
{
|
||||||
|
float16_t d;
|
||||||
|
float16_t m;
|
||||||
|
uint16_t qs[16/2];
|
||||||
|
};
|
||||||
|
|
||||||
#define A_TYPE block_q4_1
|
#define A_TYPE block_q4_1
|
||||||
|
#define A_TYPE_PACKED16 block_q4_1_packed16
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(DATA_A_Q5_0)
|
#if defined(DATA_A_Q5_0)
|
||||||
|
@ -70,7 +86,15 @@ struct block_q5_0
|
||||||
uint8_t qs[16];
|
uint8_t qs[16];
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct block_q5_0_packed16
|
||||||
|
{
|
||||||
|
float16_t d;
|
||||||
|
uint16_t qh[2];
|
||||||
|
uint16_t qs[16/2];
|
||||||
|
};
|
||||||
|
|
||||||
#define A_TYPE block_q5_0
|
#define A_TYPE block_q5_0
|
||||||
|
#define A_TYPE_PACKED16 block_q5_0_packed16
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(DATA_A_Q5_1)
|
#if defined(DATA_A_Q5_1)
|
||||||
|
@ -87,7 +111,16 @@ struct block_q5_1
|
||||||
uint8_t qs[16];
|
uint8_t qs[16];
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct block_q5_1_packed16
|
||||||
|
{
|
||||||
|
float16_t d;
|
||||||
|
float16_t m;
|
||||||
|
uint qh;
|
||||||
|
uint16_t qs[16/2];
|
||||||
|
};
|
||||||
|
|
||||||
#define A_TYPE block_q5_1
|
#define A_TYPE block_q5_1
|
||||||
|
#define A_TYPE_PACKED16 block_q5_1_packed16
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(DATA_A_Q8_0)
|
#if defined(DATA_A_Q8_0)
|
||||||
|
@ -100,8 +133,14 @@ struct block_q8_0
|
||||||
float16_t d;
|
float16_t d;
|
||||||
int8_t qs[32];
|
int8_t qs[32];
|
||||||
};
|
};
|
||||||
|
struct block_q8_0_packed16
|
||||||
|
{
|
||||||
|
float16_t d;
|
||||||
|
uint16_t qs[32/2];
|
||||||
|
};
|
||||||
|
|
||||||
#define A_TYPE block_q8_0
|
#define A_TYPE block_q8_0
|
||||||
|
#define A_TYPE_PACKED16 block_q8_0_packed16
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// K-quants
|
// K-quants
|
||||||
|
@ -116,7 +155,23 @@ struct block_q2_K
|
||||||
f16vec2 d;
|
f16vec2 d;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct block_q2_K_packed16
|
||||||
|
{
|
||||||
|
uint16_t scales[QUANT_K/16/2];
|
||||||
|
uint16_t qs[QUANT_K/4/2];
|
||||||
|
f16vec2 d;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct block_q2_K_packed32
|
||||||
|
{
|
||||||
|
uint32_t scales[QUANT_K/16/4];
|
||||||
|
uint32_t qs[QUANT_K/4/4];
|
||||||
|
f16vec2 d;
|
||||||
|
};
|
||||||
|
|
||||||
#define A_TYPE block_q2_K
|
#define A_TYPE block_q2_K
|
||||||
|
#define A_TYPE_PACKED16 block_q2_K_packed16
|
||||||
|
#define A_TYPE_PACKED32 block_q2_K_packed32
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(DATA_A_Q3_K)
|
#if defined(DATA_A_Q3_K)
|
||||||
|
@ -131,7 +186,16 @@ struct block_q3_K
|
||||||
float16_t d;
|
float16_t d;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct block_q3_K_packed16
|
||||||
|
{
|
||||||
|
uint16_t hmask[QUANT_K/8/2];
|
||||||
|
uint16_t qs[QUANT_K/4/2];
|
||||||
|
uint16_t scales[12/2];
|
||||||
|
float16_t d;
|
||||||
|
};
|
||||||
|
|
||||||
#define A_TYPE block_q3_K
|
#define A_TYPE block_q3_K
|
||||||
|
#define A_TYPE_PACKED16 block_q3_K_packed16
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(DATA_A_Q4_K)
|
#if defined(DATA_A_Q4_K)
|
||||||
|
@ -145,7 +209,23 @@ struct block_q4_K
|
||||||
uint8_t qs[QUANT_K/2];
|
uint8_t qs[QUANT_K/2];
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct block_q4_K_packed16
|
||||||
|
{
|
||||||
|
f16vec2 d;
|
||||||
|
uint16_t scales[3*QUANT_K/64/2];
|
||||||
|
uint16_t qs[QUANT_K/2/2];
|
||||||
|
};
|
||||||
|
|
||||||
|
struct block_q4_K_packed32
|
||||||
|
{
|
||||||
|
f16vec2 d;
|
||||||
|
uint32_t scales[3*QUANT_K/64/4];
|
||||||
|
uint32_t qs[QUANT_K/2/4];
|
||||||
|
};
|
||||||
|
|
||||||
#define A_TYPE block_q4_K
|
#define A_TYPE block_q4_K
|
||||||
|
#define A_TYPE_PACKED16 block_q4_K_packed16
|
||||||
|
#define A_TYPE_PACKED32 block_q4_K_packed32
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(DATA_A_Q5_K)
|
#if defined(DATA_A_Q5_K)
|
||||||
|
@ -160,7 +240,16 @@ struct block_q5_K
|
||||||
uint8_t qs[QUANT_K/2];
|
uint8_t qs[QUANT_K/2];
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct block_q5_K_packed16
|
||||||
|
{
|
||||||
|
f16vec2 d;
|
||||||
|
uint16_t scales[12/2];
|
||||||
|
uint16_t qh[QUANT_K/8/2];
|
||||||
|
uint16_t qs[QUANT_K/2/2];
|
||||||
|
};
|
||||||
|
|
||||||
#define A_TYPE block_q5_K
|
#define A_TYPE block_q5_K
|
||||||
|
#define A_TYPE_PACKED16 block_q5_K_packed16
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(DATA_A_Q6_K)
|
#if defined(DATA_A_Q6_K)
|
||||||
|
@ -175,7 +264,16 @@ struct block_q6_K
|
||||||
float16_t d;
|
float16_t d;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct block_q6_K_packed16
|
||||||
|
{
|
||||||
|
uint16_t ql[QUANT_K/2/2];
|
||||||
|
uint16_t qh[QUANT_K/4/2];
|
||||||
|
int8_t scales[QUANT_K/16];
|
||||||
|
float16_t d;
|
||||||
|
};
|
||||||
|
|
||||||
#define A_TYPE block_q6_K
|
#define A_TYPE block_q6_K
|
||||||
|
#define A_TYPE_PACKED16 block_q6_K_packed16
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// IQuants
|
// IQuants
|
||||||
|
@ -191,10 +289,30 @@ struct block_iq4_nl
|
||||||
uint8_t qs[QUANT_K/2];
|
uint8_t qs[QUANT_K/2];
|
||||||
};
|
};
|
||||||
|
|
||||||
#define A_TYPE block_iq4_nl
|
struct block_iq4_nl_packed16
|
||||||
|
{
|
||||||
|
float16_t d;
|
||||||
|
uint16_t qs[QUANT_K/2/2];
|
||||||
|
};
|
||||||
|
|
||||||
const int8_t kvalues_iq4nl[16] = {
|
#define A_TYPE block_iq4_nl
|
||||||
|
#define A_TYPE_PACKED16 block_iq4_nl_packed16
|
||||||
|
|
||||||
|
const int8_t kvalues_iq4nl_const[16] = {
|
||||||
int8_t(-127), int8_t(-104), int8_t(-83), int8_t(-65), int8_t(-49), int8_t(-35), int8_t(-22), int8_t(-10),
|
int8_t(-127), int8_t(-104), int8_t(-83), int8_t(-65), int8_t(-49), int8_t(-35), int8_t(-22), int8_t(-10),
|
||||||
int8_t(1), int8_t(13), int8_t(25), int8_t(38), int8_t(53), int8_t(69), int8_t(89), int8_t(113)
|
int8_t(1), int8_t(13), int8_t(25), int8_t(38), int8_t(53), int8_t(69), int8_t(89), int8_t(113)
|
||||||
};
|
};
|
||||||
|
|
||||||
|
shared FLOAT_TYPE kvalues_iq4nl[16];
|
||||||
|
|
||||||
|
void init_iq4nl_shmem()
|
||||||
|
{
|
||||||
|
// copy the table into shared memory and sync
|
||||||
|
if (gl_LocalInvocationIndex.x < 16) {
|
||||||
|
kvalues_iq4nl[gl_LocalInvocationIndex.x] = FLOAT_TYPE(kvalues_iq4nl_const[gl_LocalInvocationIndex.x]);
|
||||||
|
}
|
||||||
|
barrier();
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#endif // !defined(GGML_TYPES_COMP)
|
||||||
|
|
|
@ -317,10 +317,10 @@ void process_shaders() {
|
||||||
std::string data_a_key = "DATA_A_" + to_uppercase(tname);
|
std::string data_a_key = "DATA_A_" + to_uppercase(tname);
|
||||||
std::string shader = (string_ends_with(tname, "_k")) ? "mul_mat_vec_" + tname + ".comp" : "mul_mat_vec.comp";
|
std::string shader = (string_ends_with(tname, "_k")) ? "mul_mat_vec_" + tname + ".comp" : "mul_mat_vec.comp";
|
||||||
|
|
||||||
string_to_spv("mul_mat_vec_" + tname + "_f32_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}}));
|
string_to_spv("mul_mat_vec_" + tname + "_f32_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC2", "vec2"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}}));
|
||||||
string_to_spv("mul_mat_vec_" + tname + "_f16_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float16_t"}, {"B_TYPE_VEC4", "f16vec4"}, {"D_TYPE", "float"}}));
|
string_to_spv("mul_mat_vec_" + tname + "_f16_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float16_t"}, {"B_TYPE_VEC2", "f16vec2"}, {"B_TYPE_VEC4", "f16vec4"}, {"D_TYPE", "float"}}));
|
||||||
|
|
||||||
string_to_spv("mul_mat_vec_id_" + tname + "_f32", shader, merge_maps(base_dict, {{"MUL_MAT_ID", "1"}, {data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}}));
|
string_to_spv("mul_mat_vec_id_" + tname + "_f32", shader, merge_maps(base_dict, {{"MUL_MAT_ID", "1"}, {data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC2", "vec2"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}}));
|
||||||
|
|
||||||
// Dequant shaders
|
// Dequant shaders
|
||||||
if (tname != "f16") {
|
if (tname != "f16") {
|
||||||
|
@ -331,11 +331,11 @@ void process_shaders() {
|
||||||
shader = (tname == "f32" || tname == "f16") ? "get_rows.comp" : "get_rows_quant.comp";
|
shader = (tname == "f32" || tname == "f16") ? "get_rows.comp" : "get_rows_quant.comp";
|
||||||
|
|
||||||
if (tname == "f16") {
|
if (tname == "f16") {
|
||||||
string_to_spv("get_rows_" + tname, shader, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
|
string_to_spv("get_rows_" + tname, shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}}));
|
||||||
} else {
|
} else {
|
||||||
string_to_spv("get_rows_" + tname, shader, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}});
|
string_to_spv("get_rows_" + tname, shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}}));
|
||||||
}
|
}
|
||||||
string_to_spv("get_rows_" + tname + "_f32", shader, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float"}});
|
string_to_spv("get_rows_" + tname + "_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float"}}));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue