Merge branch 'master' into move-convert-py
This commit is contained in:
commit
36558d9795
69 changed files with 3963 additions and 7977 deletions
5
.github/labeler.yml
vendored
5
.github/labeler.yml
vendored
|
@ -62,6 +62,8 @@ server:
|
||||||
ggml:
|
ggml:
|
||||||
- changed-files:
|
- changed-files:
|
||||||
- any-glob-to-any-file:
|
- any-glob-to-any-file:
|
||||||
|
- ggml.c
|
||||||
|
- ggml.h
|
||||||
- ggml-*.c
|
- ggml-*.c
|
||||||
- ggml-*.h
|
- ggml-*.h
|
||||||
- ggml-cuda/**
|
- ggml-cuda/**
|
||||||
|
@ -71,3 +73,6 @@ nix:
|
||||||
- "**/*.nix"
|
- "**/*.nix"
|
||||||
- .github/workflows/nix-*.yml
|
- .github/workflows/nix-*.yml
|
||||||
- .devops/nix/nixpkgs-instances.nix
|
- .devops/nix/nixpkgs-instances.nix
|
||||||
|
embedding:
|
||||||
|
- changed-files:
|
||||||
|
- any-glob-to-any-file: examples/embedding/
|
||||||
|
|
29
.github/workflows/zig-build.yml
vendored
29
.github/workflows/zig-build.yml
vendored
|
@ -1,29 +0,0 @@
|
||||||
# CI workflow that builds the project with Zig on Linux, macOS and Windows.
name: Zig CI

# Run for every pull request and for pushes to master.
on:
  pull_request:
  push:
    branches:
      - master

# Keep a single run per workflow/ref; a newer run cancels the one in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true

jobs:
  build:
    strategy:
      # Let the remaining platforms finish even when one of them fails.
      fail-fast: false
      matrix:
        runs-on:
          - ubuntu-latest
          - macos-latest
          - windows-latest
    runs-on: ${{ matrix.runs-on }}
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
          fetch-depth: 0
      - uses: goto-bus-stop/setup-zig@v2
        with:
          version: 0.11.0
      - name: Build Summary
        run: zig build --summary all -freference-trace
|
|
|
@ -124,7 +124,6 @@ set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING
|
||||||
set(LLAMA_METAL_STD "" CACHE STRING "llama: metal standard version (-std flag)")
|
set(LLAMA_METAL_STD "" CACHE STRING "llama: metal standard version (-std flag)")
|
||||||
option(LLAMA_KOMPUTE "llama: use Kompute" OFF)
|
option(LLAMA_KOMPUTE "llama: use Kompute" OFF)
|
||||||
option(LLAMA_RPC "llama: use RPC" OFF)
|
option(LLAMA_RPC "llama: use RPC" OFF)
|
||||||
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
|
|
||||||
option(LLAMA_SYCL "llama: use SYCL" OFF)
|
option(LLAMA_SYCL "llama: use SYCL" OFF)
|
||||||
option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF)
|
option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF)
|
||||||
set(LLAMA_SYCL_TARGET "INTEL" CACHE STRING "llama: sycl target device")
|
set(LLAMA_SYCL_TARGET "INTEL" CACHE STRING "llama: sycl target device")
|
||||||
|
@ -384,10 +383,6 @@ if (LLAMA_LLAMAFILE)
|
||||||
set(GGML_SOURCES_LLAMAFILE sgemm.cpp)
|
set(GGML_SOURCES_LLAMAFILE sgemm.cpp)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (LLAMA_QKK_64)
|
|
||||||
add_compile_definitions(GGML_QKK_64)
|
|
||||||
endif()
|
|
||||||
|
|
||||||
if (LLAMA_CUBLAS)
|
if (LLAMA_CUBLAS)
|
||||||
message(WARNING "LLAMA_CUBLAS is deprecated and will be removed in the future.\nUse LLAMA_CUDA instead")
|
message(WARNING "LLAMA_CUBLAS is deprecated and will be removed in the future.\nUse LLAMA_CUDA instead")
|
||||||
set(LLAMA_CUDA ON)
|
set(LLAMA_CUDA ON)
|
||||||
|
@ -505,6 +500,12 @@ if (LLAMA_VULKAN)
|
||||||
|
|
||||||
add_compile_definitions(GGML_USE_VULKAN)
|
add_compile_definitions(GGML_USE_VULKAN)
|
||||||
|
|
||||||
|
# Workaround to the "can't dereference invalidated vector iterator" bug in clang-cl debug build
|
||||||
|
# Possibly relevant: https://stackoverflow.com/questions/74748276/visual-studio-no-displays-the-correct-length-of-stdvector
|
||||||
|
if (MSVC AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
|
||||||
|
add_compile_definitions(_ITERATOR_DEBUG_LEVEL=0)
|
||||||
|
endif()
|
||||||
|
|
||||||
if (LLAMA_VULKAN_CHECK_RESULTS)
|
if (LLAMA_VULKAN_CHECK_RESULTS)
|
||||||
add_compile_definitions(GGML_VULKAN_CHECK_RESULTS)
|
add_compile_definitions(GGML_VULKAN_CHECK_RESULTS)
|
||||||
endif()
|
endif()
|
||||||
|
|
4
Makefile
4
Makefile
|
@ -389,10 +389,6 @@ else
|
||||||
MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
|
MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifdef LLAMA_QKK_64
|
|
||||||
MK_CPPFLAGS += -DGGML_QKK_64
|
|
||||||
endif
|
|
||||||
|
|
||||||
ifndef LLAMA_NO_ACCELERATE
|
ifndef LLAMA_NO_ACCELERATE
|
||||||
# Mac OS - include Accelerate framework.
|
# Mac OS - include Accelerate framework.
|
||||||
# `-framework Accelerate` works both with Apple Silicon and Mac Intel
|
# `-framework Accelerate` works both with Apple Silicon and Mac Intel
|
||||||
|
|
|
@ -127,6 +127,7 @@ Typically finetunes of the base models below are supported as well.
|
||||||
- [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
|
- [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
|
||||||
- [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
|
- [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
|
||||||
- [x] [OLMo](https://allenai.org/olmo)
|
- [x] [OLMo](https://allenai.org/olmo)
|
||||||
|
- [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia)
|
||||||
|
|
||||||
(instructions for supporting more models: [HOWTO-add-model.md](./docs/HOWTO-add-model.md))
|
(instructions for supporting more models: [HOWTO-add-model.md](./docs/HOWTO-add-model.md))
|
||||||
|
|
||||||
|
@ -140,6 +141,7 @@ Typically finetunes of the base models below are supported as well.
|
||||||
- [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL)
|
- [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL)
|
||||||
- [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM)
|
- [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM)
|
||||||
- [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)
|
- [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)
|
||||||
|
- [x] [Bunny](https://github.com/BAAI-DCAI/Bunny)
|
||||||
|
|
||||||
**HTTP server**
|
**HTTP server**
|
||||||
|
|
||||||
|
|
172
build.zig
172
build.zig
|
@ -1,172 +0,0 @@
|
||||||
// Compatible with Zig Version 0.11.0
|
|
||||||
const std = @import("std");
|
|
||||||
const ArrayList = std.ArrayList;
|
|
||||||
const Compile = std.Build.Step.Compile;
|
|
||||||
const ConfigHeader = std.Build.Step.ConfigHeader;
|
|
||||||
const Mode = std.builtin.Mode;
|
|
||||||
const CrossTarget = std.zig.CrossTarget;
|
|
||||||
|
|
||||||
/// Shared build state: the target/optimize settings, the compiler flags and
/// include directories applied to every translation unit, plus helpers that
/// create object files and executables from a single source file.
const Maker = struct {
    builder: *std.build.Builder,
    target: CrossTarget,
    optimize: Mode,
    enable_lto: bool,

    include_dirs: ArrayList([]const u8),
    cflags: ArrayList([]const u8),
    cxxflags: ArrayList([]const u8),
    objs: ArrayList(*Compile),

    /// Registers an include directory used by every object and executable.
    fn addInclude(m: *Maker, dir: []const u8) !void {
        try m.include_dirs.append(dir);
    }

    /// Registers an include directory given as path components relative to
    /// the build root.
    fn addProjectInclude(m: *Maker, path: []const []const u8) !void {
        try m.addInclude(try m.builder.build_root.join(m.builder.allocator, path));
    }

    /// Adds a flag used when compiling `.c` sources.
    fn addCFlag(m: *Maker, flag: []const u8) !void {
        try m.cflags.append(flag);
    }

    /// Adds a flag used when compiling non-`.c` (C++) sources.
    fn addCxxFlag(m: *Maker, flag: []const u8) !void {
        try m.cxxflags.append(flag);
    }

    /// Adds a flag to both the C and the C++ flag lists.
    fn addFlag(m: *Maker, flag: []const u8) !void {
        try m.addCFlag(flag);
        try m.addCxxFlag(flag);
    }

    /// Creates the Maker and, as a side effect, writes
    /// `common/build-info.cpp` (relative to the current working directory)
    /// with the current git commit, the Zig version and the target
    /// description. The build number is always written as 0.
    fn init(builder: *std.build.Builder) !Maker {
        const target = builder.standardTargetOptions(.{});
        const zig_version = @import("builtin").zig_version_string;
        const commit_hash = try std.ChildProcess.exec(
            .{ .allocator = builder.allocator, .argv = &.{ "git", "rev-parse", "HEAD" } },
        );
        // Strip the trailing line terminator. Trimming (instead of slicing
        // off the last byte) also copes with empty output and with "\r\n".
        const commit = std.mem.trimRight(u8, commit_hash.stdout, " \t\r\n");
        try std.fs.cwd().writeFile("common/build-info.cpp", builder.fmt(
            \\int LLAMA_BUILD_NUMBER = {};
            \\char const *LLAMA_COMMIT = "{s}";
            \\char const *LLAMA_COMPILER = "Zig {s}";
            \\char const *LLAMA_BUILD_TARGET = "{s}";
            \\
        , .{ 0, commit, zig_version, try target.allocDescription(builder.allocator) }));
        var m = Maker{
            .builder = builder,
            .target = target,
            .optimize = builder.standardOptimizeOption(.{}),
            .enable_lto = false,
            .include_dirs = ArrayList([]const u8).init(builder.allocator),
            .cflags = ArrayList([]const u8).init(builder.allocator),
            .cxxflags = ArrayList([]const u8).init(builder.allocator),
            .objs = ArrayList(*Compile).init(builder.allocator),
        };

        try m.addCFlag("-std=c11");
        try m.addCxxFlag("-std=c++11");
        try m.addProjectInclude(&.{});
        try m.addProjectInclude(&.{"common"});
        return m;
    }

    /// Compiles `src` into an object named `name`. Sources ending in ".c"
    /// use the C flags; everything else uses the C++ flags.
    fn obj(m: *const Maker, name: []const u8, src: []const u8) *Compile {
        const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize });
        if (o.target.getAbi() != .msvc)
            o.defineCMacro("_GNU_SOURCE", null);

        if (std.mem.endsWith(u8, src, ".c")) {
            o.addCSourceFiles(&.{src}, m.cflags.items);
            o.linkLibC();
        } else {
            o.addCSourceFiles(&.{src}, m.cxxflags.items);
            if (o.target.getAbi() == .msvc) {
                o.linkLibC(); // need winsdk + crt
            } else {
                // linkLibCpp already adds (libc++ + libunwind + libc)
                o.linkLibCpp();
            }
        }
        for (m.include_dirs.items) |i| o.addIncludePath(.{ .path = i });
        o.want_lto = m.enable_lto;
        return o;
    }

    /// Builds and installs an executable from `src` (compiled with the C++
    /// flags), adding the objects in `deps` followed by those in `m.objs`.
    fn exe(m: *const Maker, name: []const u8, src: []const u8, deps: []const *Compile) *Compile {
        const e = m.builder.addExecutable(.{ .name = name, .target = m.target, .optimize = m.optimize });
        e.addCSourceFiles(&.{src}, m.cxxflags.items);
        for (deps) |d| e.addObject(d);
        for (m.objs.items) |o| e.addObject(o);
        for (m.include_dirs.items) |i| e.addIncludePath(.{ .path = i });

        // https://github.com/ziglang/zig/issues/15448
        if (e.target.getAbi() == .msvc) {
            e.linkLibC(); // need winsdk + crt
        } else {
            // linkLibCpp already adds (libc++ + libunwind + libc)
            e.linkLibCpp();
        }
        m.builder.installArtifact(e);
        e.want_lto = m.enable_lto;
        return e;
    }
};
|
|
||||||
|
|
||||||
/// Build entry point: compiles the shared objects, declares the example
/// executables, and converts the server's web assets into C array headers.
pub fn build(b: *std.build.Builder) !void {
    var make = try Maker.init(b);
    make.enable_lto = b.option(bool, "lto", "Enable LTO optimization, (default: false)") orelse false;

    const ggml = make.obj("ggml", "ggml.c");
    const sgemm = make.obj("sgemm", "sgemm.cpp");
    const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
    const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
    const ggml_quants = make.obj("ggml-quants", "ggml-quants.c");
    const unicode = make.obj("unicode", "unicode.cpp");
    const unicode_data = make.obj("unicode-data", "unicode-data.cpp");
    const llama = make.obj("llama", "llama.cpp");
    // Named after its source file so it does not share the name "common"
    // with the object built from common/common.cpp below.
    const buildinfo = make.obj("build-info", "common/build-info.cpp");
    const common = make.obj("common", "common/common.cpp");
    const console = make.obj("console", "common/console.cpp");
    const sampling = make.obj("sampling", "common/sampling.cpp");
    const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
    const json_schema_to_grammar = make.obj("json-schema-to-grammar", "common/json-schema-to-grammar.cpp");
    const train = make.obj("train", "common/train.cpp");
    const clip = make.obj("clip", "examples/llava/clip.cpp");
    const llava = make.obj("llava", "examples/llava/llava.cpp");

    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, console, grammar_parser });
    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
    _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
    _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
    _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train });
    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train });

    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, grammar_parser, clip, llava });
    if (server.target.isWindows()) {
        server.linkSystemLibrary("ws2_32");
    }

    const server_assets = [_][]const u8{ "index.html", "index.js", "completion.js", "json-schema-to-grammar.mjs" };
    for (server_assets) |asset| {
        const input_path = b.fmt("examples/server/public/{s}", .{asset});
        const output_path = b.fmt("examples/server/{s}.hpp", .{asset});

        // Portable equivalent of `b.addSystemCommand(&.{ "xxd", "-n", asset, "-i", input_path, output_path })`:

        const input = try std.fs.cwd().readFileAlloc(b.allocator, input_path, std.math.maxInt(usize));
        defer b.allocator.free(input);

        // Render every input byte as a "0xNN, " array element.
        var buf = std.ArrayList(u8).init(b.allocator);
        defer buf.deinit();

        for (input) |byte| {
            try std.fmt.format(buf.writer(), "0x{X:0>2}, ", .{byte});
        }

        // Derive the C identifier from the asset name: '-' and '.' become '_'.
        var name = try std.mem.replaceOwned(u8, b.allocator, asset, "-", "_");
        defer b.allocator.free(name);
        std.mem.replaceScalar(u8, name, '.', '_');

        try std.fs.cwd().writeFile(output_path, b.fmt(
            "unsigned char {s}[] = {{{s}}};\nunsigned int {s}_len = {d};\n",
            .{ name, buf.items, name, input.len },
        ));

        std.debug.print("Dumped hex of \"{s}\" ({s}) to {s}\n", .{ input_path, name, output_path });
    }
}
|
|
423
ci/run.sh
423
ci/run.sh
|
@ -202,12 +202,15 @@ function gg_sum_test_scripts_release {
|
||||||
}
|
}
|
||||||
|
|
||||||
function gg_get_model {
|
function gg_get_model {
|
||||||
local gguf_3b="$MNT/models/open-llama/3B-v2/ggml-model-f16.gguf"
|
local gguf_0="$MNT/models/pythia/1.4B/ggml-model-f16.gguf"
|
||||||
local gguf_7b="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"
|
local gguf_1="$MNT/models/pythia/2.8B/ggml-model-f16.gguf"
|
||||||
if [[ -s $gguf_3b ]]; then
|
local gguf_2="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"
|
||||||
echo -n "$gguf_3b"
|
if [[ -s $gguf_0 ]]; then
|
||||||
elif [[ -s $gguf_7b ]]; then
|
echo -n "$gguf_0"
|
||||||
echo -n "$gguf_7b"
|
elif [[ -s $gguf_1 ]]; then
|
||||||
|
echo -n "$gguf_1"
|
||||||
|
elif [[ -s $gguf_2 ]]; then
|
||||||
|
echo -n "$gguf_2"
|
||||||
else
|
else
|
||||||
echo >&2 "No model found. Can't run gg_run_ctest_with_model."
|
echo >&2 "No model found. Can't run gg_run_ctest_with_model."
|
||||||
exit 1
|
exit 1
|
||||||
|
@ -256,139 +259,6 @@ function gg_sum_ctest_with_model_release {
|
||||||
gg_printf '```\n'
|
gg_printf '```\n'
|
||||||
}
|
}
|
||||||
|
|
||||||
# open_llama_3b_v2
|
|
||||||
|
|
||||||
# Downloads OpenLLaMA 3B-v2 and a wikitext sample, builds a release binary with
# -DLLAMA_QKK_64=1, converts and quantizes the model, then runs text generation,
# perplexity, imatrix and save-load-state, logging each step under $OUT/${ci}-*.
function gg_run_open_llama_3b_v2 {
    cd "${SRC}"

    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/config.json
    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/tokenizer.model
    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/tokenizer_config.json
    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/special_tokens_map.json
    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/pytorch_model.bin
    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/generation_config.json

    gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
    unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
    head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw

    path_models="../models-mnt/open-llama/3B-v2"
    path_wiki="../models-mnt/wikitext/wikitext-2-raw"

    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release

    set -e

    # ${CMAKE_EXTRA} is intentionally unquoted so it can expand to several arguments.
    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log

    python3 ../examples/convert-legacy-llama.py ${path_models}

    model_f16="${path_models}/ggml-model-f16.gguf"
    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
    model_q4_0="${path_models}/ggml-model-q4_0.gguf"
    model_q4_1="${path_models}/ggml-model-q4_1.gguf"
    model_q5_0="${path_models}/ggml-model-q5_0.gguf"
    model_q5_1="${path_models}/ggml-model-q5_1.gguf"
    model_q2_k="${path_models}/ggml-model-q2_k.gguf"
    model_q3_k="${path_models}/ggml-model-q3_k.gguf"
    model_q4_k="${path_models}/ggml-model-q4_k.gguf"
    model_q5_k="${path_models}/ggml-model-q5_k.gguf"
    model_q6_k="${path_models}/ggml-model-q6_k.gguf"

    wiki_test_60="${path_wiki}/wiki.test-60.raw"

    # Quantization types, in the order every step below processes them.
    local quants="q8_0 q4_0 q4_1 q5_0 q5_1 q2_k q3_k q4_k q5_k q6_k"
    local q model_var

    for q in ${quants}; do
        model_var="model_${q}"   # resolved via indirect expansion: ${!model_var}
        ./bin/quantize ${model_f16} ${!model_var} ${q}
    done

    for q in f16 ${quants}; do
        model_var="model_${q}"
        (time ./bin/main --model ${!model_var} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-${q}.log
    done

    # Perplexity output is appended to the same per-quant log as text generation.
    for q in f16 ${quants}; do
        model_var="model_${q}"
        (time ./bin/perplexity --model ${!model_var} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-${q}.log
    done

    (time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log

    (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
    (time ./bin/save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log

    # check_ppl <quant-name> <perplexity-output-line>
    # Takes the last decimal number in the second argument as the perplexity and
    # returns 20 when it is missing or greater than 20.0, 0 otherwise.
    function check_ppl {
        qnt="$1"
        # `|| true`: a non-matching grep must not abort under `set -e`; the
        # empty value is reported explicitly below.
        ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1) || true

        # Without this guard an empty value makes the numeric test below error
        # out and fall through to the "OK" line.
        if [ -z "$ppl" ]; then
            printf ' - %s @ %s (FAIL: no perplexity value found)\n' "$qnt" "$ppl"
            return 20
        fi

        if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
            printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
            return 20
        fi

        printf ' - %s @ %s OK\n' "$qnt" "$ppl"
        return 0
    }

    for q in f16 ${quants}; do
        check_ppl "${q}" "$(cat $OUT/${ci}-tg-${q}.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    done

    cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log

    set +e
}
|
|
||||||
|
|
||||||
# Writes the OpenLLaMA 3B-v2 report section via gg_printf: exit status,
# perplexity summary, imatrix summary, then one fenced log per model variant
# and finally the save-load-state log.
function gg_sum_open_llama_3b_v2 {
    local q

    gg_printf '### %s\n\n' "${ci}"

    gg_printf 'OpenLLaMA 3B-v2:\n'
    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
    gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
    gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"

    # The f16 heading carries a trailing space after the colon, so it is not
    # part of the loop below.
    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"

    for q in q8_0 q4_0 q4_1 q5_0 q5_1 q2_k q3_k q4_k q5_k q6_k; do
        gg_printf '- %s:\n```\n%s\n```\n' "${q}" "$(cat $OUT/${ci}-tg-${q}.log)"
    done

    gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
}
|
|
||||||
|
|
||||||
# open_llama_7b_v2
|
# open_llama_7b_v2
|
||||||
# requires: GG_BUILD_CUDA
|
# requires: GG_BUILD_CUDA
|
||||||
|
|
||||||
|
@ -417,7 +287,7 @@ function gg_run_open_llama_7b_v2 {
|
||||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||||
|
|
||||||
python3 ../examples/convert-legacy-llama.py ${path_models}
|
python3 ../convert.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||||
|
|
||||||
model_f16="${path_models}/ggml-model-f16.gguf"
|
model_f16="${path_models}/ggml-model-f16.gguf"
|
||||||
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
||||||
|
@ -526,6 +396,272 @@ function gg_sum_open_llama_7b_v2 {
|
||||||
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
|
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# pythia_1.4b
|
||||||
|
|
||||||
|
# Downloads Pythia 1.4B and a wikitext sample, builds a release binary,
# converts and quantizes the model, then runs text generation, perplexity,
# imatrix and save-load-state, logging each step under $OUT/${ci}-*.
function gg_run_pythia_1_4b {
    cd "${SRC}"

    gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/config.json
    gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/tokenizer.json
    gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/tokenizer_config.json
    gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/special_tokens_map.json
    gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/resolve/main/pytorch_model.bin

    gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
    unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
    head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw

    path_models="../models-mnt/pythia/1.4B"
    path_wiki="../models-mnt/wikitext/wikitext-2-raw"

    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release

    set -e

    # ${CMAKE_EXTRA} is intentionally unquoted so it can expand to several arguments.
    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log

    python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

    model_f16="${path_models}/ggml-model-f16.gguf"
    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
    model_q4_0="${path_models}/ggml-model-q4_0.gguf"
    model_q4_1="${path_models}/ggml-model-q4_1.gguf"
    model_q5_0="${path_models}/ggml-model-q5_0.gguf"
    model_q5_1="${path_models}/ggml-model-q5_1.gguf"
    model_q2_k="${path_models}/ggml-model-q2_k.gguf"
    model_q3_k="${path_models}/ggml-model-q3_k.gguf"
    model_q4_k="${path_models}/ggml-model-q4_k.gguf"
    model_q5_k="${path_models}/ggml-model-q5_k.gguf"
    model_q6_k="${path_models}/ggml-model-q6_k.gguf"

    wiki_test_60="${path_wiki}/wiki.test-60.raw"

    # Quantization types, in the order every step below processes them.
    local quants="q8_0 q4_0 q4_1 q5_0 q5_1 q2_k q3_k q4_k q5_k q6_k"
    local q model_var

    for q in ${quants}; do
        model_var="model_${q}"   # resolved via indirect expansion: ${!model_var}
        ./bin/quantize ${model_f16} ${!model_var} ${q}
    done

    for q in f16 ${quants}; do
        model_var="model_${q}"
        (time ./bin/main --model ${!model_var} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-${q}.log
    done

    # Perplexity output is appended to the same per-quant log as text generation.
    for q in f16 ${quants}; do
        model_var="model_${q}"
        (time ./bin/perplexity --model ${!model_var} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-${q}.log
    done

    (time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log

    (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
    (time ./bin/save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log

    # check_ppl <quant-name> <perplexity-output-line>
    # Takes the last decimal number in the second argument as the perplexity and
    # returns 20 when it is missing or greater than 20.0, 0 otherwise.
    function check_ppl {
        qnt="$1"
        # `|| true`: a non-matching grep must not abort under `set -e`; the
        # empty value is reported explicitly below.
        ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1) || true

        # Without this guard an empty value makes the numeric test below error
        # out and fall through to the "OK" line.
        if [ -z "$ppl" ]; then
            printf ' - %s @ %s (FAIL: no perplexity value found)\n' "$qnt" "$ppl"
            return 20
        fi

        if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
            printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
            return 20
        fi

        printf ' - %s @ %s OK\n' "$qnt" "$ppl"
        return 0
    }

    for q in f16 ${quants}; do
        # note: ppl > 20.0 for this quant and model, so q2_k is not checked
        if [ "${q}" = "q2_k" ]; then
            continue
        fi
        check_ppl "${q}" "$(cat $OUT/${ci}-tg-${q}.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    done

    cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log

    set +e
}
|
||||||
|
|
||||||
|
function gg_sum_pythia_1_4b {
|
||||||
|
gg_printf '### %s\n\n' "${ci}"
|
||||||
|
|
||||||
|
gg_printf 'Pythia 1.4B:\n'
|
||||||
|
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
||||||
|
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
|
||||||
|
gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
|
||||||
|
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
|
||||||
|
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
|
||||||
|
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
|
||||||
|
gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
|
||||||
|
gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
|
||||||
|
gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
|
||||||
|
gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
|
||||||
|
gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
|
||||||
|
gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
|
||||||
|
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
|
||||||
|
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
|
||||||
|
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
|
||||||
|
}
|
||||||
|
|
||||||
|
# pythia_2_8b
|
||||||
|
# requires: GG_BUILD_CUDA
|
||||||
|
|
||||||
|
function gg_run_pythia_2_8b {
|
||||||
|
cd ${SRC}
|
||||||
|
|
||||||
|
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/config.json
|
||||||
|
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/tokenizer.json
|
||||||
|
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/tokenizer_config.json
|
||||||
|
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/special_tokens_map.json
|
||||||
|
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/resolve/main/pytorch_model.bin
|
||||||
|
|
||||||
|
gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
|
||||||
|
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
|
||||||
|
|
||||||
|
path_models="../models-mnt/pythia/2.8B"
|
||||||
|
path_wiki="../models-mnt/wikitext/wikitext-2-raw"
|
||||||
|
|
||||||
|
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||||
|
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||||
|
|
||||||
|
python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||||
|
|
||||||
|
model_f16="${path_models}/ggml-model-f16.gguf"
|
||||||
|
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
||||||
|
model_q4_0="${path_models}/ggml-model-q4_0.gguf"
|
||||||
|
model_q4_1="${path_models}/ggml-model-q4_1.gguf"
|
||||||
|
model_q5_0="${path_models}/ggml-model-q5_0.gguf"
|
||||||
|
model_q5_1="${path_models}/ggml-model-q5_1.gguf"
|
||||||
|
model_q2_k="${path_models}/ggml-model-q2_k.gguf"
|
||||||
|
model_q3_k="${path_models}/ggml-model-q3_k.gguf"
|
||||||
|
model_q4_k="${path_models}/ggml-model-q4_k.gguf"
|
||||||
|
model_q5_k="${path_models}/ggml-model-q5_k.gguf"
|
||||||
|
model_q6_k="${path_models}/ggml-model-q6_k.gguf"
|
||||||
|
|
||||||
|
wiki_test="${path_wiki}/wiki.test.raw"
|
||||||
|
|
||||||
|
./bin/quantize ${model_f16} ${model_q8_0} q8_0
|
||||||
|
./bin/quantize ${model_f16} ${model_q4_0} q4_0
|
||||||
|
./bin/quantize ${model_f16} ${model_q4_1} q4_1
|
||||||
|
./bin/quantize ${model_f16} ${model_q5_0} q5_0
|
||||||
|
./bin/quantize ${model_f16} ${model_q5_1} q5_1
|
||||||
|
./bin/quantize ${model_f16} ${model_q2_k} q2_k
|
||||||
|
./bin/quantize ${model_f16} ${model_q3_k} q3_k
|
||||||
|
./bin/quantize ${model_f16} ${model_q4_k} q4_k
|
||||||
|
./bin/quantize ${model_f16} ${model_q5_k} q5_k
|
||||||
|
./bin/quantize ${model_f16} ${model_q6_k} q6_k
|
||||||
|
|
||||||
|
(time ./bin/main --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||||
|
(time ./bin/main --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||||
|
(time ./bin/main --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||||
|
(time ./bin/main --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||||
|
(time ./bin/main --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||||
|
(time ./bin/main --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||||
|
(time ./bin/main --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||||
|
(time ./bin/main --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||||
|
(time ./bin/main --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||||
|
(time ./bin/main --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||||
|
(time ./bin/main --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||||
|
|
||||||
|
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||||
|
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||||
|
(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||||
|
(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||||
|
(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||||
|
(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||||
|
(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||||
|
(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||||
|
(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||||
|
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||||
|
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||||
|
|
||||||
|
(time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||||
|
|
||||||
|
(time ./bin/save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
|
(time ./bin/save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
|
(time ./bin/save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
|
(time ./bin/save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
|
|
||||||
|
function check_ppl {
|
||||||
|
qnt="$1"
|
||||||
|
ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
|
||||||
|
|
||||||
|
if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
|
||||||
|
printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
|
||||||
|
return 20
|
||||||
|
fi
|
||||||
|
|
||||||
|
printf ' - %s @ %s OK\n' "$qnt" "$ppl"
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
#check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log # note: ppl > 20.0 for this quant and model
|
||||||
|
check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
|
||||||
|
cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
|
||||||
|
|
||||||
|
set +e
|
||||||
|
}
|
||||||
|
|
||||||
|
function gg_sum_pythia_2_8b {
|
||||||
|
gg_printf '### %s\n\n' "${ci}"
|
||||||
|
|
||||||
|
gg_printf 'Pythia 2.8B:\n'
|
||||||
|
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
||||||
|
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
|
||||||
|
gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
|
||||||
|
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
|
||||||
|
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
|
||||||
|
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
|
||||||
|
gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
|
||||||
|
gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
|
||||||
|
gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
|
||||||
|
gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
|
||||||
|
gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
|
||||||
|
gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
|
||||||
|
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
|
||||||
|
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
|
||||||
|
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
|
||||||
|
}
|
||||||
|
|
||||||
# bge-small
|
# bge-small
|
||||||
|
|
||||||
function gg_run_embd_bge_small {
|
function gg_run_embd_bge_small {
|
||||||
|
@ -552,7 +688,7 @@ function gg_run_embd_bge_small {
|
||||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||||
|
|
||||||
python3 ../convert-hf-to-gguf.py ${path_models}
|
python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||||
|
|
||||||
model_f16="${path_models}/ggml-model-f16.gguf"
|
model_f16="${path_models}/ggml-model-f16.gguf"
|
||||||
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
||||||
|
@ -606,9 +742,10 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
||||||
|
|
||||||
if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
|
if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
|
||||||
if [ -z ${GG_BUILD_CUDA} ]; then
|
if [ -z ${GG_BUILD_CUDA} ]; then
|
||||||
test $ret -eq 0 && gg_run open_llama_3b_v2
|
test $ret -eq 0 && gg_run pythia_1_4b
|
||||||
else
|
else
|
||||||
test $ret -eq 0 && gg_run open_llama_7b_v2
|
test $ret -eq 0 && gg_run pythia_2_8b
|
||||||
|
#test $ret -eq 0 && gg_run open_llama_7b_v2
|
||||||
fi
|
fi
|
||||||
test $ret -eq 0 && gg_run ctest_with_model_debug
|
test $ret -eq 0 && gg_run ctest_with_model_debug
|
||||||
test $ret -eq 0 && gg_run ctest_with_model_release
|
test $ret -eq 0 && gg_run ctest_with_model_release
|
||||||
|
|
1324
common/common.cpp
1324
common/common.cpp
File diff suppressed because it is too large
Load diff
|
@ -35,14 +35,18 @@
|
||||||
|
|
||||||
// build info
|
// build info
|
||||||
extern int LLAMA_BUILD_NUMBER;
|
extern int LLAMA_BUILD_NUMBER;
|
||||||
extern char const *LLAMA_COMMIT;
|
extern char const * LLAMA_COMMIT;
|
||||||
extern char const *LLAMA_COMPILER;
|
extern char const * LLAMA_COMPILER;
|
||||||
extern char const *LLAMA_BUILD_TARGET;
|
extern char const * LLAMA_BUILD_TARGET;
|
||||||
|
|
||||||
struct llama_control_vector_load_info;
|
struct llama_control_vector_load_info;
|
||||||
|
|
||||||
int get_math_cpu_count();
|
//
|
||||||
int32_t get_num_physical_cores();
|
// CPU utils
|
||||||
|
//
|
||||||
|
|
||||||
|
int32_t cpu_get_num_physical_cores();
|
||||||
|
int32_t cpu_get_num_math();
|
||||||
|
|
||||||
//
|
//
|
||||||
// CLI argument parsing
|
// CLI argument parsing
|
||||||
|
@ -51,7 +55,7 @@ int32_t get_num_physical_cores();
|
||||||
struct gpt_params {
|
struct gpt_params {
|
||||||
uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
|
uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
|
||||||
|
|
||||||
int32_t n_threads = get_math_cpu_count();
|
int32_t n_threads = cpu_get_num_math();
|
||||||
int32_t n_threads_draft = -1;
|
int32_t n_threads_draft = -1;
|
||||||
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
|
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
|
||||||
int32_t n_threads_batch_draft = -1;
|
int32_t n_threads_batch_draft = -1;
|
||||||
|
@ -179,33 +183,34 @@ struct gpt_params {
|
||||||
|
|
||||||
void gpt_params_handle_model_default(gpt_params & params);
|
void gpt_params_handle_model_default(gpt_params & params);
|
||||||
|
|
||||||
bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
|
bool gpt_params_parse_ex (int argc, char ** argv, gpt_params & params);
|
||||||
|
bool gpt_params_parse (int argc, char ** argv, gpt_params & params);
|
||||||
|
bool gpt_params_find_arg (int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
|
||||||
|
void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params);
|
||||||
|
|
||||||
bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);
|
std::string gpt_params_get_system_info(const gpt_params & params);
|
||||||
|
|
||||||
bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
|
|
||||||
|
|
||||||
void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
|
|
||||||
|
|
||||||
bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
|
|
||||||
|
|
||||||
std::string get_system_info(const gpt_params & params);
|
|
||||||
|
|
||||||
std::string gpt_random_prompt(std::mt19937 & rng);
|
|
||||||
|
|
||||||
void process_escapes(std::string& input);
|
|
||||||
|
|
||||||
bool validate_file_name(const std::string & filename);
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// String utils
|
// String utils
|
||||||
//
|
//
|
||||||
|
|
||||||
std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
|
|
||||||
std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string);
|
|
||||||
std::vector<std::string> string_split(std::string input, char separator);
|
std::vector<std::string> string_split(std::string input, char separator);
|
||||||
|
|
||||||
std::string string_strip(const std::string & str);
|
std::string string_strip(const std::string & str);
|
||||||
std::string sampler_type_to_name_string(llama_sampler_type sampler_type);
|
std::string string_get_sortable_timestamp();
|
||||||
|
std::string string_random_prompt(std::mt19937 & rng);
|
||||||
|
|
||||||
|
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
|
||||||
|
void string_process_escapes(std::string & input);
|
||||||
|
|
||||||
|
//
|
||||||
|
// Filesystem utils
|
||||||
|
//
|
||||||
|
|
||||||
|
bool fs_validate_filename(const std::string & filename);
|
||||||
|
bool fs_create_directory_with_parents(const std::string & path);
|
||||||
|
|
||||||
|
std::string fs_get_cache_directory();
|
||||||
|
|
||||||
//
|
//
|
||||||
// Model utils
|
// Model utils
|
||||||
|
@ -276,29 +281,15 @@ std::string llama_detokenize_bpe(
|
||||||
// defaults to true when model type is SPM, otherwise false.
|
// defaults to true when model type is SPM, otherwise false.
|
||||||
bool llama_should_add_bos_token(const llama_model * model);
|
bool llama_should_add_bos_token(const llama_model * model);
|
||||||
|
|
||||||
//
|
|
||||||
// YAML utils
|
|
||||||
//
|
|
||||||
|
|
||||||
bool create_directory_with_parents(const std::string & path);
|
|
||||||
void dump_vector_float_yaml(FILE * stream, const char * prop_name, const std::vector<float> & data);
|
|
||||||
void dump_vector_int_yaml(FILE * stream, const char * prop_name, const std::vector<int> & data);
|
|
||||||
void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const char * data);
|
|
||||||
std::string get_sortable_timestamp();
|
|
||||||
|
|
||||||
void dump_non_result_info_yaml(
|
|
||||||
FILE * stream, const gpt_params & params, const llama_context * lctx,
|
|
||||||
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// KV cache utils
|
// KV cache utils
|
||||||
//
|
//
|
||||||
|
|
||||||
// Dump the KV cache view with the number of sequences per cell.
|
// Dump the KV cache view with the number of sequences per cell.
|
||||||
void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80);
|
void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
|
||||||
|
|
||||||
// Dump the KV cache view showing individual sequences in each cell (long output).
|
// Dump the KV cache view showing individual sequences in each cell (long output).
|
||||||
void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
|
void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
|
||||||
|
|
||||||
//
|
//
|
||||||
// Embedding utils
|
// Embedding utils
|
||||||
|
@ -332,6 +323,20 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
|
||||||
//
|
//
|
||||||
// Split utils
|
// Split utils
|
||||||
//
|
//
|
||||||
|
|
||||||
static const char * const LLM_KV_SPLIT_NO = "split.no";
|
static const char * const LLM_KV_SPLIT_NO = "split.no";
|
||||||
static const char * const LLM_KV_SPLIT_COUNT = "split.count";
|
static const char * const LLM_KV_SPLIT_COUNT = "split.count";
|
||||||
static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
|
static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
|
||||||
|
|
||||||
|
//
|
||||||
|
// YAML utils
|
||||||
|
//
|
||||||
|
|
||||||
|
void yaml_dump_vector_float (FILE * stream, const char * prop_name, const std::vector<float> & data);
|
||||||
|
void yaml_dump_vector_int (FILE * stream, const char * prop_name, const std::vector<int> & data);
|
||||||
|
void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data);
|
||||||
|
|
||||||
|
void yaml_dump_non_result_info(
|
||||||
|
FILE * stream, const gpt_params & params, const llama_context * lctx,
|
||||||
|
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
|
||||||
|
|
||||||
|
|
|
@ -125,7 +125,7 @@ std::string llama_sampling_order_print(const llama_sampling_params & params) {
|
||||||
std::string result = "CFG -> Penalties ";
|
std::string result = "CFG -> Penalties ";
|
||||||
if (params.mirostat == 0) {
|
if (params.mirostat == 0) {
|
||||||
for (auto sampler_type : params.samplers_sequence) {
|
for (auto sampler_type : params.samplers_sequence) {
|
||||||
const auto sampler_type_name = sampler_type_to_name_string(sampler_type);
|
const auto sampler_type_name = llama_sampling_type_to_str(sampler_type);
|
||||||
if (!sampler_type_name.empty()) {
|
if (!sampler_type_name.empty()) {
|
||||||
result += "-> " + sampler_type_name + " ";
|
result += "-> " + sampler_type_name + " ";
|
||||||
}
|
}
|
||||||
|
@ -137,6 +137,87 @@ std::string llama_sampling_order_print(const llama_sampling_params & params) {
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::string llama_sampling_type_to_str(llama_sampler_type sampler_type) {
|
||||||
|
switch (sampler_type) {
|
||||||
|
case llama_sampler_type::TOP_K: return "top_k";
|
||||||
|
case llama_sampler_type::TFS_Z: return "tfs_z";
|
||||||
|
case llama_sampler_type::TYPICAL_P: return "typical_p";
|
||||||
|
case llama_sampler_type::TOP_P: return "top_p";
|
||||||
|
case llama_sampler_type::MIN_P: return "min_p";
|
||||||
|
case llama_sampler_type::TEMPERATURE: return "temperature";
|
||||||
|
default : return "";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
|
||||||
|
std::unordered_map<std::string, llama_sampler_type> sampler_canonical_name_map {
|
||||||
|
{"top_k", llama_sampler_type::TOP_K},
|
||||||
|
{"top_p", llama_sampler_type::TOP_P},
|
||||||
|
{"typical_p", llama_sampler_type::TYPICAL_P},
|
||||||
|
{"min_p", llama_sampler_type::MIN_P},
|
||||||
|
{"tfs_z", llama_sampler_type::TFS_Z},
|
||||||
|
{"temperature", llama_sampler_type::TEMPERATURE}
|
||||||
|
};
|
||||||
|
|
||||||
|
// since samplers names are written multiple ways
|
||||||
|
// make it ready for both system names and input names
|
||||||
|
std::unordered_map<std::string, llama_sampler_type> sampler_alt_name_map {
|
||||||
|
{"top-k", llama_sampler_type::TOP_K},
|
||||||
|
{"top-p", llama_sampler_type::TOP_P},
|
||||||
|
{"nucleus", llama_sampler_type::TOP_P},
|
||||||
|
{"typical-p", llama_sampler_type::TYPICAL_P},
|
||||||
|
{"typical", llama_sampler_type::TYPICAL_P},
|
||||||
|
{"min-p", llama_sampler_type::MIN_P},
|
||||||
|
{"tfs-z", llama_sampler_type::TFS_Z},
|
||||||
|
{"tfs", llama_sampler_type::TFS_Z},
|
||||||
|
{"temp", llama_sampler_type::TEMPERATURE}
|
||||||
|
};
|
||||||
|
|
||||||
|
std::vector<llama_sampler_type> sampler_types;
|
||||||
|
sampler_types.reserve(names.size());
|
||||||
|
for (const auto & name : names)
|
||||||
|
{
|
||||||
|
auto sampler_item = sampler_canonical_name_map.find(name);
|
||||||
|
if (sampler_item != sampler_canonical_name_map.end())
|
||||||
|
{
|
||||||
|
sampler_types.push_back(sampler_item->second);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if (allow_alt_names)
|
||||||
|
{
|
||||||
|
sampler_item = sampler_alt_name_map.find(name);
|
||||||
|
if (sampler_item != sampler_alt_name_map.end())
|
||||||
|
{
|
||||||
|
sampler_types.push_back(sampler_item->second);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return sampler_types;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::string & names_string) {
|
||||||
|
std::unordered_map<char, llama_sampler_type> sampler_name_map {
|
||||||
|
{'k', llama_sampler_type::TOP_K},
|
||||||
|
{'p', llama_sampler_type::TOP_P},
|
||||||
|
{'y', llama_sampler_type::TYPICAL_P},
|
||||||
|
{'m', llama_sampler_type::MIN_P},
|
||||||
|
{'f', llama_sampler_type::TFS_Z},
|
||||||
|
{'t', llama_sampler_type::TEMPERATURE}
|
||||||
|
};
|
||||||
|
|
||||||
|
std::vector<llama_sampler_type> sampler_types;
|
||||||
|
sampler_types.reserve(names_string.size());
|
||||||
|
for (const auto & c : names_string) {
|
||||||
|
const auto sampler_item = sampler_name_map.find(c);
|
||||||
|
if (sampler_item != sampler_name_map.end()) {
|
||||||
|
sampler_types.push_back(sampler_item->second);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return sampler_types;
|
||||||
|
}
|
||||||
|
|
||||||
// no reasons to expose this function in header
|
// no reasons to expose this function in header
|
||||||
static void sampler_queue(
|
static void sampler_queue(
|
||||||
struct llama_context * ctx_main,
|
struct llama_context * ctx_main,
|
||||||
|
@ -179,7 +260,7 @@ static llama_token llama_sampling_sample_impl(
|
||||||
struct llama_context * ctx_main,
|
struct llama_context * ctx_main,
|
||||||
struct llama_context * ctx_cfg,
|
struct llama_context * ctx_cfg,
|
||||||
const int idx,
|
const int idx,
|
||||||
bool is_resampling) { // Add a parameter to indicate if we are resampling
|
bool is_resampling) {
|
||||||
const llama_sampling_params & params = ctx_sampling->params;
|
const llama_sampling_params & params = ctx_sampling->params;
|
||||||
|
|
||||||
const float temp = params.temp;
|
const float temp = params.temp;
|
||||||
|
@ -188,8 +269,8 @@ static llama_token llama_sampling_sample_impl(
|
||||||
const float mirostat_eta = params.mirostat_eta;
|
const float mirostat_eta = params.mirostat_eta;
|
||||||
|
|
||||||
std::vector<float> original_logits;
|
std::vector<float> original_logits;
|
||||||
auto cur_p = llama_sampling_prepare(ctx_sampling, ctx_main, ctx_cfg, idx, !is_resampling, &original_logits);
|
auto cur_p = llama_sampling_prepare(ctx_sampling, ctx_main, ctx_cfg, idx, /* apply_grammar= */ is_resampling, &original_logits);
|
||||||
if (!is_resampling) {
|
if (ctx_sampling->grammar != NULL && !is_resampling) {
|
||||||
GGML_ASSERT(!original_logits.empty());
|
GGML_ASSERT(!original_logits.empty());
|
||||||
}
|
}
|
||||||
llama_token id = 0;
|
llama_token id = 0;
|
||||||
|
@ -252,7 +333,7 @@ static llama_token llama_sampling_sample_impl(
|
||||||
// Restore logits from the copy
|
// Restore logits from the copy
|
||||||
std::copy(original_logits.begin(), original_logits.end(), logits);
|
std::copy(original_logits.begin(), original_logits.end(), logits);
|
||||||
|
|
||||||
return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, true); // Pass true for is_resampling
|
return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, /* is_resampling= */ true);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -285,7 +366,8 @@ static llama_token_data_array llama_sampling_prepare_impl(
|
||||||
// Get a pointer to the logits
|
// Get a pointer to the logits
|
||||||
float * logits = llama_get_logits_ith(ctx_main, idx);
|
float * logits = llama_get_logits_ith(ctx_main, idx);
|
||||||
|
|
||||||
if (apply_grammar && original_logits != NULL) {
|
if (ctx_sampling->grammar != NULL && !apply_grammar) {
|
||||||
|
GGML_ASSERT(original_logits != NULL);
|
||||||
// Only make a copy of the original logits if we are not applying grammar checks, not sure if I actually have to do this.
|
// Only make a copy of the original logits if we are not applying grammar checks, not sure if I actually have to do this.
|
||||||
*original_logits = {logits, logits + llama_n_vocab(llama_get_model(ctx_main))};
|
*original_logits = {logits, logits + llama_n_vocab(llama_get_model(ctx_main))};
|
||||||
}
|
}
|
||||||
|
@ -342,7 +424,7 @@ llama_token llama_sampling_sample(
|
||||||
struct llama_context * ctx_cfg,
|
struct llama_context * ctx_cfg,
|
||||||
const int idx) {
|
const int idx) {
|
||||||
// Call the implementation function with is_resampling set to false by default
|
// Call the implementation function with is_resampling set to false by default
|
||||||
return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, false);
|
return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, /* is_resampling= */ false);
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_token_data_array llama_sampling_prepare(
|
llama_token_data_array llama_sampling_prepare(
|
||||||
|
|
|
@ -116,6 +116,11 @@ std::string llama_sampling_print(const llama_sampling_params & params);
|
||||||
// Print sampling order into a string
|
// Print sampling order into a string
|
||||||
std::string llama_sampling_order_print(const llama_sampling_params & params);
|
std::string llama_sampling_order_print(const llama_sampling_params & params);
|
||||||
|
|
||||||
|
std::string llama_sampling_type_to_str(llama_sampler_type sampler_type);
|
||||||
|
|
||||||
|
std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
|
||||||
|
std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::string & names_string);
|
||||||
|
|
||||||
// this is a common sampling function used across the examples for convenience
|
// this is a common sampling function used across the examples for convenience
|
||||||
// it can serve as a starting point for implementing your own sampling function
|
// it can serve as a starting point for implementing your own sampling function
|
||||||
// Note: When using multiple sequences, it is the caller's responsibility to call
|
// Note: When using multiple sequences, it is the caller's responsibility to call
|
||||||
|
|
|
@ -1380,7 +1380,7 @@ bool consume_common_train_arg(
|
||||||
|
|
||||||
void finish_processing_train_args(struct train_params_common * params) {
|
void finish_processing_train_args(struct train_params_common * params) {
|
||||||
if (params->escape) {
|
if (params->escape) {
|
||||||
process_escapes(params->sample_start);
|
string_process_escapes(params->sample_start);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -72,7 +72,7 @@ models = [
|
||||||
{"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
|
{"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
|
||||||
{"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
|
{"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
|
||||||
{"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
|
{"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
|
||||||
{"name": "stablelm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b", },
|
{"name": "stablelm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b", },
|
||||||
{"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
|
{"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
|
||||||
{"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
|
{"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
|
||||||
{"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
|
{"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
|
||||||
|
|
|
@ -14,6 +14,7 @@ from pathlib import Path
|
||||||
from hashlib import sha256
|
from hashlib import sha256
|
||||||
from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Sequence, TypeVar, cast
|
from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Sequence, TypeVar, cast
|
||||||
|
|
||||||
|
import math
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
|
@ -447,7 +448,7 @@ class Model:
|
||||||
# ref: https://huggingface.co/openai-community/gpt2
|
# ref: https://huggingface.co/openai-community/gpt2
|
||||||
res = "gpt-2"
|
res = "gpt-2"
|
||||||
if chkhsh == "32d85c31273f8019248f2559fed492d929ea28b17e51d81d3bb36fff23ca72b3":
|
if chkhsh == "32d85c31273f8019248f2559fed492d929ea28b17e51d81d3bb36fff23ca72b3":
|
||||||
# ref: https://huggingface.co/stabilityai/stablelm-2-1_6b
|
# ref: https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b
|
||||||
res = "stablelm2"
|
res = "stablelm2"
|
||||||
if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff":
|
if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff":
|
||||||
# ref: https://huggingface.co/smallcloudai/Refact-1_6-base
|
# ref: https://huggingface.co/smallcloudai/Refact-1_6-base
|
||||||
|
@ -672,6 +673,44 @@ class GPTNeoXModel(Model):
|
||||||
self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True))
|
self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True))
|
||||||
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])
|
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])
|
||||||
|
|
||||||
|
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||||
|
del bid # unused
|
||||||
|
|
||||||
|
n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
|
||||||
|
n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
|
||||||
|
|
||||||
|
tensors: list[tuple[str, Tensor]] = []
|
||||||
|
|
||||||
|
if re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.weight", name):
|
||||||
|
# Map bloom-style qkv_linear to gpt-style qkv_linear
|
||||||
|
# bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa
|
||||||
|
# gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa
|
||||||
|
qkv_weights = data_torch.reshape((n_head, 3, n_embed // n_head, n_embed))
|
||||||
|
data_torch = torch.cat(
|
||||||
|
(
|
||||||
|
qkv_weights[:, 0, :, :].reshape((-1, n_embed)),
|
||||||
|
qkv_weights[:, 1, :, :].reshape((-1, n_embed)),
|
||||||
|
qkv_weights[:, 2, :, :].reshape((-1, n_embed)),
|
||||||
|
),
|
||||||
|
dim=0,
|
||||||
|
)
|
||||||
|
logger.info("re-format attention.linear_qkv.weight")
|
||||||
|
elif re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.bias", name):
|
||||||
|
qkv_bias = data_torch.reshape((n_head, 3, n_embed // n_head))
|
||||||
|
data_torch = torch.cat(
|
||||||
|
(
|
||||||
|
qkv_bias[:, 0, :].reshape((n_embed,)),
|
||||||
|
qkv_bias[:, 1, :].reshape((n_embed,)),
|
||||||
|
qkv_bias[:, 2, :].reshape((n_embed,)),
|
||||||
|
),
|
||||||
|
dim=0,
|
||||||
|
)
|
||||||
|
logger.info("re-format attention.linear_qkv.bias")
|
||||||
|
|
||||||
|
tensors.append((self.map_tensor_name(name), data_torch))
|
||||||
|
|
||||||
|
return tensors
|
||||||
|
|
||||||
|
|
||||||
@Model.register("BloomForCausalLM")
|
@Model.register("BloomForCausalLM")
|
||||||
class BloomModel(Model):
|
class BloomModel(Model):
|
||||||
|
@ -1749,7 +1788,7 @@ class Phi3MiniModel(Model):
|
||||||
token_id = int(token_id)
|
token_id = int(token_id)
|
||||||
token = foken_data["content"].encode("utf-8")
|
token = foken_data["content"].encode("utf-8")
|
||||||
if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
|
if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
|
||||||
assert(tokens[token_id] == token)
|
assert tokens[token_id] == token
|
||||||
tokens[token_id] = token
|
tokens[token_id] = token
|
||||||
scores[token_id] = -1000.0
|
scores[token_id] = -1000.0
|
||||||
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
|
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
|
||||||
|
@ -1765,7 +1804,7 @@ class Phi3MiniModel(Model):
|
||||||
token_id = int(foken_data["id"])
|
token_id = int(foken_data["id"])
|
||||||
token = foken_data["content"].encode("utf-8")
|
token = foken_data["content"].encode("utf-8")
|
||||||
if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
|
if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
|
||||||
assert(tokens[token_id] == token)
|
assert tokens[token_id] == token
|
||||||
tokens[token_id] = token
|
tokens[token_id] = token
|
||||||
scores[token_id] = -1000.0
|
scores[token_id] = -1000.0
|
||||||
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
|
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
|
||||||
|
@ -1784,23 +1823,59 @@ class Phi3MiniModel(Model):
|
||||||
def set_gguf_parameters(self):
|
def set_gguf_parameters(self):
|
||||||
block_count = self.find_hparam(["num_hidden_layers", "n_layer"])
|
block_count = self.find_hparam(["num_hidden_layers", "n_layer"])
|
||||||
|
|
||||||
rot_pct = 1.0
|
|
||||||
n_embd = self.find_hparam(["hidden_size", "n_embd"])
|
n_embd = self.find_hparam(["hidden_size", "n_embd"])
|
||||||
n_head = self.find_hparam(["num_attention_heads", "n_head"])
|
n_head = self.find_hparam(["num_attention_heads", "n_head"])
|
||||||
|
n_head_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"])
|
||||||
rms_eps = self.find_hparam(["rms_norm_eps"])
|
rms_eps = self.find_hparam(["rms_norm_eps"])
|
||||||
|
max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
|
||||||
|
orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
|
||||||
|
rope_dims = n_embd // n_head
|
||||||
|
|
||||||
self.gguf_writer.add_name("Phi3")
|
self.gguf_writer.add_name("Phi3")
|
||||||
self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"]))
|
self.gguf_writer.add_context_length(max_pos_embds)
|
||||||
|
self.gguf_writer.add_rope_scaling_orig_ctx_len(orig_max_pos_embds)
|
||||||
self.gguf_writer.add_embedding_length(n_embd)
|
self.gguf_writer.add_embedding_length(n_embd)
|
||||||
self.gguf_writer.add_feed_forward_length(8192)
|
self.gguf_writer.add_feed_forward_length(self.find_hparam(["intermediate_size"]))
|
||||||
self.gguf_writer.add_block_count(block_count)
|
self.gguf_writer.add_block_count(block_count)
|
||||||
self.gguf_writer.add_head_count(n_head)
|
self.gguf_writer.add_head_count(n_head)
|
||||||
self.gguf_writer.add_head_count_kv(n_head)
|
self.gguf_writer.add_head_count_kv(n_head_kv)
|
||||||
self.gguf_writer.add_layer_norm_rms_eps(rms_eps)
|
self.gguf_writer.add_layer_norm_rms_eps(rms_eps)
|
||||||
self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
|
self.gguf_writer.add_rope_dimension_count(rope_dims)
|
||||||
|
self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
|
||||||
self.gguf_writer.add_file_type(self.ftype)
|
self.gguf_writer.add_file_type(self.ftype)
|
||||||
|
|
||||||
|
# write rope scaling for long context (128k) model
|
||||||
|
rope_scaling = self.find_hparam(['rope_scaling'], True)
|
||||||
|
if (rope_scaling is None):
|
||||||
|
return
|
||||||
|
|
||||||
|
scale = max_pos_embds / orig_max_pos_embds
|
||||||
|
|
||||||
|
rope_scaling_type = rope_scaling.get('type', '').lower()
|
||||||
|
if len(rope_scaling_type) == 0:
|
||||||
|
raise KeyError('Missing the required key rope_scaling.type')
|
||||||
|
|
||||||
|
if rope_scaling_type == 'su':
|
||||||
|
attn_factor = math.sqrt(1 + math.log(scale) / math.log(orig_max_pos_embds)) if scale > 1.0 else 1.0
|
||||||
|
elif rope_scaling_type == 'yarn':
|
||||||
|
attn_factor = 0.1 * math.log(scale) + 1.0 if scale > 1.0 else 1.0
|
||||||
|
else:
|
||||||
|
raise NotImplementedError(f'The rope scaling type {rope_scaling_type} is not supported yet')
|
||||||
|
|
||||||
|
self.gguf_writer.add_rope_scaling_attn_factors(attn_factor)
|
||||||
|
|
||||||
|
long_factors = rope_scaling.get('long_factor', None)
|
||||||
|
short_factors = rope_scaling.get('short_factor', None)
|
||||||
|
|
||||||
|
if long_factors is None or short_factors is None:
|
||||||
|
raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')
|
||||||
|
|
||||||
|
if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
|
||||||
|
raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
|
||||||
|
|
||||||
|
self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG] + ".weight", np.array(long_factors, dtype=np.float32))
|
||||||
|
self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight", np.array(short_factors, dtype=np.float32))
|
||||||
|
|
||||||
|
|
||||||
@Model.register("PlamoForCausalLM")
|
@Model.register("PlamoForCausalLM")
|
||||||
class PlamoModel(Model):
|
class PlamoModel(Model):
|
||||||
|
|
|
@ -48,7 +48,7 @@ int main(int argc, char ** argv) {
|
||||||
params.prompt = "Hello my name is";
|
params.prompt = "Hello my name is";
|
||||||
}
|
}
|
||||||
|
|
||||||
process_escapes(params.prompt);
|
string_process_escapes(params.prompt);
|
||||||
|
|
||||||
// init LLM
|
// init LLM
|
||||||
|
|
||||||
|
|
|
@ -80,7 +80,7 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
std::mt19937 rng(params.seed);
|
std::mt19937 rng(params.seed);
|
||||||
if (params.random_prompt) {
|
if (params.random_prompt) {
|
||||||
params.prompt = gpt_random_prompt(rng);
|
params.prompt = string_random_prompt(rng);
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_backend_init();
|
llama_backend_init();
|
||||||
|
@ -107,7 +107,7 @@ int main(int argc, char ** argv) {
|
||||||
// print system information
|
// print system information
|
||||||
{
|
{
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
fprintf(stderr, "%s\n", get_system_info(params).c_str());
|
fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
// split the prompt into lines
|
// split the prompt into lines
|
||||||
|
|
|
@ -152,7 +152,7 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
std::mt19937 rng(params.seed);
|
std::mt19937 rng(params.seed);
|
||||||
if (params.random_prompt) {
|
if (params.random_prompt) {
|
||||||
params.prompt = gpt_random_prompt(rng);
|
params.prompt = string_random_prompt(rng);
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_backend_init();
|
llama_backend_init();
|
||||||
|
@ -176,7 +176,7 @@ int main(int argc, char ** argv) {
|
||||||
// print system information
|
// print system information
|
||||||
{
|
{
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
fprintf(stderr, "%s\n", get_system_info(params).c_str());
|
fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
bool OK = run(ctx, params);
|
bool OK = run(ctx, params);
|
||||||
|
|
|
@ -563,8 +563,8 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
|
||||||
// not capturing these, to silence warnings
|
// not capturing these, to silence warnings
|
||||||
const int rope_mode = 0;
|
const int rope_mode = 0;
|
||||||
|
|
||||||
return ggml_rope_custom(ctx,
|
return ggml_rope_ext(ctx,
|
||||||
t, KQ_pos, n_rot, rope_mode, n_ctx, 0,
|
t, KQ_pos, nullptr, n_rot, rope_mode, n_ctx, 0,
|
||||||
rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
|
rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
|
||||||
);
|
);
|
||||||
};
|
};
|
||||||
|
@ -643,7 +643,8 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
|
||||||
struct ggml_tensor * t15 = ggml_permute (ctx, t12, 0, 3, 1, 2); set_name(t15, "t15"); assert_shape_4d(t15, N, n_embd_head, n_head_kv, n_batch);
|
struct ggml_tensor * t15 = ggml_permute (ctx, t12, 0, 3, 1, 2); set_name(t15, "t15"); assert_shape_4d(t15, N, n_embd_head, n_head_kv, n_batch);
|
||||||
struct ggml_tensor * t16;
|
struct ggml_tensor * t16;
|
||||||
if (enable_flash_attn) {
|
if (enable_flash_attn) {
|
||||||
t16 = ggml_flash_attn(ctx, t13, t14, t15, true); set_name(t16, "t16"); assert_shape_4d(t16, n_embd_head, N, n_head, n_batch);
|
GGML_ASSERT(false && "TODO: ggml_flash_attn_ext() not yet supported");
|
||||||
|
//t16 = ggml_flash_attn(ctx, t13, t14, t15, true); set_name(t16, "t16"); assert_shape_4d(t16, n_embd_head, N, n_head, n_batch);
|
||||||
} else {
|
} else {
|
||||||
struct ggml_tensor * t16_0 = ggml_mul_mat (ctx, t14, t13); set_name(t16_0, "t16_0"); assert_shape_4d(t16_0, N, N, n_head, n_batch);
|
struct ggml_tensor * t16_0 = ggml_mul_mat (ctx, t14, t13); set_name(t16_0, "t16_0"); assert_shape_4d(t16_0, N, N, n_head, n_batch);
|
||||||
struct ggml_tensor * t16_1 = ggml_scale_inplace (ctx, t16_0, kv_scale); set_name(t16_1, "t16_1"); assert_shape_4d(t16_1, N, N, n_head, n_batch);
|
struct ggml_tensor * t16_1 = ggml_scale_inplace (ctx, t16_0, kv_scale); set_name(t16_1, "t16_1"); assert_shape_4d(t16_1, N, N, n_head, n_batch);
|
||||||
|
|
|
@ -598,7 +598,7 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
std::mt19937 rng(params.seed);
|
std::mt19937 rng(params.seed);
|
||||||
if (params.random_prompt) {
|
if (params.random_prompt) {
|
||||||
params.prompt = gpt_random_prompt(rng);
|
params.prompt = string_random_prompt(rng);
|
||||||
}
|
}
|
||||||
|
|
||||||
sparams.dataset = params.prompt_file;
|
sparams.dataset = params.prompt_file;
|
||||||
|
@ -667,7 +667,7 @@ int main(int argc, char ** argv) {
|
||||||
// print system information
|
// print system information
|
||||||
{
|
{
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
fprintf(stderr, "%s\n", get_system_info(params).c_str());
|
fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
bool OK = compute_imatrix(ctx, params, compute_ppl, from_chunk);
|
bool OK = compute_imatrix(ctx, params, compute_ppl, from_chunk);
|
||||||
|
|
|
@ -50,9 +50,9 @@ static void write_logfile(
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
const std::string timestamp = get_sortable_timestamp();
|
const std::string timestamp = string_get_sortable_timestamp();
|
||||||
|
|
||||||
const bool success = create_directory_with_parents(params.logdir);
|
const bool success = fs_create_directory_with_parents(params.logdir);
|
||||||
if (!success) {
|
if (!success) {
|
||||||
fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
|
fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
|
||||||
__func__, params.logdir.c_str());
|
__func__, params.logdir.c_str());
|
||||||
|
@ -70,7 +70,7 @@ static void write_logfile(
|
||||||
fprintf(logfile, "binary: infill\n");
|
fprintf(logfile, "binary: infill\n");
|
||||||
char model_desc[128];
|
char model_desc[128];
|
||||||
llama_model_desc(model, model_desc, sizeof(model_desc));
|
llama_model_desc(model, model_desc, sizeof(model_desc));
|
||||||
dump_non_result_info_yaml(logfile, params, ctx, timestamp, input_tokens, model_desc);
|
yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc);
|
||||||
|
|
||||||
fprintf(logfile, "\n");
|
fprintf(logfile, "\n");
|
||||||
fprintf(logfile, "######################\n");
|
fprintf(logfile, "######################\n");
|
||||||
|
@ -78,8 +78,8 @@ static void write_logfile(
|
||||||
fprintf(logfile, "######################\n");
|
fprintf(logfile, "######################\n");
|
||||||
fprintf(logfile, "\n");
|
fprintf(logfile, "\n");
|
||||||
|
|
||||||
dump_string_yaml_multiline(logfile, "output", output.c_str());
|
yaml_dump_string_multiline(logfile, "output", output.c_str());
|
||||||
dump_vector_int_yaml(logfile, "output_tokens", output_tokens);
|
yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
|
||||||
|
|
||||||
llama_dump_timing_info_yaml(logfile, ctx);
|
llama_dump_timing_info_yaml(logfile, ctx);
|
||||||
fclose(logfile);
|
fclose(logfile);
|
||||||
|
@ -236,7 +236,7 @@ int main(int argc, char ** argv) {
|
||||||
// print system information
|
// print system information
|
||||||
{
|
{
|
||||||
LOG_TEE("\n");
|
LOG_TEE("\n");
|
||||||
LOG_TEE("%s\n", get_system_info(params).c_str());
|
LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str());
|
||||||
}
|
}
|
||||||
const bool add_bos = llama_should_add_bos_token(model);
|
const bool add_bos = llama_should_add_bos_token(model);
|
||||||
GGML_ASSERT(llama_add_eos_token(model) != 1);
|
GGML_ASSERT(llama_add_eos_token(model) != 1);
|
||||||
|
@ -621,8 +621,8 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
if (params.escape) {
|
if (params.escape) {
|
||||||
//process escape sequences, for the initial prompt this is done in common.cpp when we load the params, but for the interactive mode we need to do it here
|
//process escape sequences, for the initial prompt this is done in common.cpp when we load the params, but for the interactive mode we need to do it here
|
||||||
process_escapes(params.input_prefix);
|
string_process_escapes(params.input_prefix);
|
||||||
process_escapes(params.input_suffix);
|
string_process_escapes(params.input_suffix);
|
||||||
}
|
}
|
||||||
suff_rm_leading_spc = params.escape;
|
suff_rm_leading_spc = params.escape;
|
||||||
if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
|
if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
|
||||||
|
|
|
@ -195,12 +195,12 @@ static const cmd_params cmd_params_defaults = {
|
||||||
/* model */ {"models/7B/ggml-model-q4_0.gguf"},
|
/* model */ {"models/7B/ggml-model-q4_0.gguf"},
|
||||||
/* n_prompt */ {512},
|
/* n_prompt */ {512},
|
||||||
/* n_gen */ {128},
|
/* n_gen */ {128},
|
||||||
/* n_pg */ {{512, 128}},
|
/* n_pg */ {},
|
||||||
/* n_batch */ {2048},
|
/* n_batch */ {2048},
|
||||||
/* n_ubatch */ {512},
|
/* n_ubatch */ {512},
|
||||||
/* type_k */ {GGML_TYPE_F16},
|
/* type_k */ {GGML_TYPE_F16},
|
||||||
/* type_v */ {GGML_TYPE_F16},
|
/* type_v */ {GGML_TYPE_F16},
|
||||||
/* n_threads */ {get_math_cpu_count()},
|
/* n_threads */ {cpu_get_num_math()},
|
||||||
/* n_gpu_layers */ {99},
|
/* n_gpu_layers */ {99},
|
||||||
/* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
|
/* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
|
||||||
/* main_gpu */ {0},
|
/* main_gpu */ {0},
|
||||||
|
|
|
@ -290,7 +290,7 @@ int main(int argc, char ** argv) {
|
||||||
#endif // LOG_DISABLE_LOGS
|
#endif // LOG_DISABLE_LOGS
|
||||||
|
|
||||||
if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
|
if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
|
||||||
gpt_print_usage(argc, argv, params);
|
gpt_params_print_usage(argc, argv, params);
|
||||||
show_additional_info(argc, argv);
|
show_additional_info(argc, argv);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
|
@ -174,7 +174,7 @@ int main(int argc, char ** argv) {
|
||||||
// debug
|
// debug
|
||||||
if (dump_kv_cache) {
|
if (dump_kv_cache) {
|
||||||
llama_kv_cache_view_update(ctx, &kvc_view);
|
llama_kv_cache_view_update(ctx, &kvc_view);
|
||||||
dump_kv_cache_view_seqs(kvc_view, 40);
|
llama_kv_cache_dump_view_seqs(kvc_view, 40);
|
||||||
}
|
}
|
||||||
|
|
||||||
// build the mask from https://lmsys.org/blog/2023-11-21-lookahead-decoding/
|
// build the mask from https://lmsys.org/blog/2023-11-21-lookahead-decoding/
|
||||||
|
|
|
@ -121,7 +121,7 @@ int main(int argc, char ** argv){
|
||||||
// debug
|
// debug
|
||||||
if (dump_kv_cache) {
|
if (dump_kv_cache) {
|
||||||
llama_kv_cache_view_update(ctx, &kvc_view);
|
llama_kv_cache_view_update(ctx, &kvc_view);
|
||||||
dump_kv_cache_view_seqs(kvc_view, 40);
|
llama_kv_cache_dump_view_seqs(kvc_view, 40);
|
||||||
}
|
}
|
||||||
|
|
||||||
// print current draft sequence
|
// print current draft sequence
|
||||||
|
|
|
@ -325,3 +325,5 @@ These options provide extra functionality and customization when running the LLa
|
||||||
- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance.
|
- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance.
|
||||||
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
|
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
|
||||||
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
|
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
|
||||||
|
|
||||||
|
- `-hfr URL, --hf-repo URL`: The URL of the Hugging Face model repository. Used in conjunction with `--hf-file` or `-hff`. The model is downloaded and stored in the file provided by `-m` or `--model`. If `-m` is not provided, the model is auto-stored in the path specified by the `LLAMA_CACHE` environment variable or in an OS-specific local cache.
|
||||||
|
|
|
@ -60,9 +60,9 @@ static void write_logfile(
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
const std::string timestamp = get_sortable_timestamp();
|
const std::string timestamp = string_get_sortable_timestamp();
|
||||||
|
|
||||||
const bool success = create_directory_with_parents(params.logdir);
|
const bool success = fs_create_directory_with_parents(params.logdir);
|
||||||
if (!success) {
|
if (!success) {
|
||||||
fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
|
fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
|
||||||
__func__, params.logdir.c_str());
|
__func__, params.logdir.c_str());
|
||||||
|
@ -80,7 +80,7 @@ static void write_logfile(
|
||||||
fprintf(logfile, "binary: main\n");
|
fprintf(logfile, "binary: main\n");
|
||||||
char model_desc[128];
|
char model_desc[128];
|
||||||
llama_model_desc(model, model_desc, sizeof(model_desc));
|
llama_model_desc(model, model_desc, sizeof(model_desc));
|
||||||
dump_non_result_info_yaml(logfile, params, ctx, timestamp, input_tokens, model_desc);
|
yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc);
|
||||||
|
|
||||||
fprintf(logfile, "\n");
|
fprintf(logfile, "\n");
|
||||||
fprintf(logfile, "######################\n");
|
fprintf(logfile, "######################\n");
|
||||||
|
@ -88,8 +88,8 @@ static void write_logfile(
|
||||||
fprintf(logfile, "######################\n");
|
fprintf(logfile, "######################\n");
|
||||||
fprintf(logfile, "\n");
|
fprintf(logfile, "\n");
|
||||||
|
|
||||||
dump_string_yaml_multiline(logfile, "output", output.c_str());
|
yaml_dump_string_multiline(logfile, "output", output.c_str());
|
||||||
dump_vector_int_yaml(logfile, "output_tokens", output_tokens);
|
yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
|
||||||
|
|
||||||
llama_dump_timing_info_yaml(logfile, ctx);
|
llama_dump_timing_info_yaml(logfile, ctx);
|
||||||
fclose(logfile);
|
fclose(logfile);
|
||||||
|
@ -181,7 +181,7 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
std::mt19937 rng(params.seed);
|
std::mt19937 rng(params.seed);
|
||||||
if (params.random_prompt) {
|
if (params.random_prompt) {
|
||||||
params.prompt = gpt_random_prompt(rng);
|
params.prompt = string_random_prompt(rng);
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG("%s: llama backend init\n", __func__);
|
LOG("%s: llama backend init\n", __func__);
|
||||||
|
@ -219,7 +219,7 @@ int main(int argc, char ** argv) {
|
||||||
// print system information
|
// print system information
|
||||||
{
|
{
|
||||||
LOG_TEE("\n");
|
LOG_TEE("\n");
|
||||||
LOG_TEE("%s\n", get_system_info(params).c_str());
|
LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string path_session = params.path_prompt_cache;
|
std::string path_session = params.path_prompt_cache;
|
||||||
|
@ -474,12 +474,12 @@ int main(int argc, char ** argv) {
|
||||||
LOG_TEE("\n\n");
|
LOG_TEE("\n\n");
|
||||||
|
|
||||||
if (params.interactive) {
|
if (params.interactive) {
|
||||||
const char *control_message;
|
const char * control_message;
|
||||||
if (params.multiline_input) {
|
if (params.multiline_input) {
|
||||||
control_message = " - To return control to LLaMa, end your input with '\\'.\n"
|
control_message = " - To return control to the AI, end your input with '\\'.\n"
|
||||||
" - To return control without starting a new line, end your input with '/'.\n";
|
" - To return control without starting a new line, end your input with '/'.\n";
|
||||||
} else {
|
} else {
|
||||||
control_message = " - Press Return to return control to LLaMa.\n"
|
control_message = " - Press Return to return control to the AI.\n"
|
||||||
" - To return control without starting a new line, end your input with '/'.\n"
|
" - To return control without starting a new line, end your input with '/'.\n"
|
||||||
" - If you want to submit another line, end your input with '\\'.\n";
|
" - If you want to submit another line, end your input with '\\'.\n";
|
||||||
}
|
}
|
||||||
|
@ -707,7 +707,7 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);
|
const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);
|
||||||
|
|
||||||
llama_sampling_accept(ctx_sampling, ctx, id, true);
|
llama_sampling_accept(ctx_sampling, ctx, id, /* apply_grammar= */ true);
|
||||||
|
|
||||||
LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());
|
LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());
|
||||||
|
|
||||||
|
@ -728,7 +728,7 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
// push the prompt in the sampling context in order to apply repetition penalties later
|
// push the prompt in the sampling context in order to apply repetition penalties later
|
||||||
// for the prompt, we don't apply grammar rules
|
// for the prompt, we don't apply grammar rules
|
||||||
llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], false);
|
llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], /* apply_grammar= */ false);
|
||||||
|
|
||||||
++n_consumed;
|
++n_consumed;
|
||||||
if ((int) embd.size() >= params.n_batch) {
|
if ((int) embd.size() >= params.n_batch) {
|
||||||
|
@ -879,7 +879,7 @@ int main(int argc, char ** argv) {
|
||||||
embd_inp.insert(embd_inp.end(), cml_pfx.begin(), cml_pfx.end());
|
embd_inp.insert(embd_inp.end(), cml_pfx.begin(), cml_pfx.end());
|
||||||
}
|
}
|
||||||
if (params.escape) {
|
if (params.escape) {
|
||||||
process_escapes(buffer);
|
string_process_escapes(buffer);
|
||||||
}
|
}
|
||||||
|
|
||||||
const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
|
const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
|
||||||
|
|
|
@ -210,7 +210,7 @@ int main(int argc, char ** argv) {
|
||||||
while (true) {
|
while (true) {
|
||||||
if (dump_kv_cache) {
|
if (dump_kv_cache) {
|
||||||
llama_kv_cache_view_update(ctx, &kvc_view);
|
llama_kv_cache_view_update(ctx, &kvc_view);
|
||||||
dump_kv_cache_view_seqs(kvc_view, 40);
|
llama_kv_cache_dump_view_seqs(kvc_view, 40);
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_batch_clear(batch);
|
llama_batch_clear(batch);
|
||||||
|
|
|
@ -44,9 +44,9 @@ static void write_logfile(
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
const std::string timestamp = get_sortable_timestamp();
|
const std::string timestamp = string_get_sortable_timestamp();
|
||||||
|
|
||||||
const bool success = create_directory_with_parents(params.logdir);
|
const bool success = fs_create_directory_with_parents(params.logdir);
|
||||||
if (!success) {
|
if (!success) {
|
||||||
fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
|
fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
|
||||||
__func__, params.logdir.c_str());
|
__func__, params.logdir.c_str());
|
||||||
|
@ -64,7 +64,7 @@ static void write_logfile(
|
||||||
fprintf(logfile, "binary: main\n");
|
fprintf(logfile, "binary: main\n");
|
||||||
char model_desc[128];
|
char model_desc[128];
|
||||||
llama_model_desc(model, model_desc, sizeof(model_desc));
|
llama_model_desc(model, model_desc, sizeof(model_desc));
|
||||||
dump_non_result_info_yaml(logfile, params, ctx, timestamp, results.tokens, model_desc);
|
yaml_dump_non_result_info(logfile, params, ctx, timestamp, results.tokens, model_desc);
|
||||||
|
|
||||||
fprintf(logfile, "\n");
|
fprintf(logfile, "\n");
|
||||||
fprintf(logfile, "######################\n");
|
fprintf(logfile, "######################\n");
|
||||||
|
@ -72,9 +72,9 @@ static void write_logfile(
|
||||||
fprintf(logfile, "######################\n");
|
fprintf(logfile, "######################\n");
|
||||||
fprintf(logfile, "\n");
|
fprintf(logfile, "\n");
|
||||||
|
|
||||||
dump_vector_float_yaml(logfile, "logits", results.logits);
|
yaml_dump_vector_float(logfile, "logits", results.logits);
|
||||||
fprintf(logfile, "ppl_value: %f\n", results.ppl_value);
|
fprintf(logfile, "ppl_value: %f\n", results.ppl_value);
|
||||||
dump_vector_float_yaml(logfile, "probs", results.probs);
|
yaml_dump_vector_float(logfile, "probs", results.probs);
|
||||||
|
|
||||||
llama_dump_timing_info_yaml(logfile, ctx);
|
llama_dump_timing_info_yaml(logfile, ctx);
|
||||||
fclose(logfile);
|
fclose(logfile);
|
||||||
|
@ -2007,7 +2007,7 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
std::mt19937 rng(params.seed);
|
std::mt19937 rng(params.seed);
|
||||||
if (params.random_prompt) {
|
if (params.random_prompt) {
|
||||||
params.prompt = gpt_random_prompt(rng);
|
params.prompt = string_random_prompt(rng);
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_backend_init();
|
llama_backend_init();
|
||||||
|
@ -2035,7 +2035,7 @@ int main(int argc, char ** argv) {
|
||||||
// print system information
|
// print system information
|
||||||
{
|
{
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
fprintf(stderr, "%s\n", get_system_info(params).c_str());
|
fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
struct results_perplexity results;
|
struct results_perplexity results;
|
||||||
|
|
|
@ -259,7 +259,7 @@ int main(int argc, char ** argv) {
|
||||||
usage(argv[0]);
|
usage(argv[0]);
|
||||||
}
|
}
|
||||||
} else if (strcmp(argv[arg_idx], "--override-kv") == 0) {
|
} else if (strcmp(argv[arg_idx], "--override-kv") == 0) {
|
||||||
if (arg_idx == argc-1 || !parse_kv_override(argv[++arg_idx], kv_overrides)) {
|
if (arg_idx == argc-1 || !string_parse_kv_override(argv[++arg_idx], kv_overrides)) {
|
||||||
usage(argv[0]);
|
usage(argv[0]);
|
||||||
}
|
}
|
||||||
} else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) {
|
} else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) {
|
||||||
|
|
|
@ -11,7 +11,7 @@ struct retrieval_params {
|
||||||
};
|
};
|
||||||
|
|
||||||
static void retrieval_params_print_usage(int argc, char ** argv, gpt_params & gpt_params, retrieval_params & params) {
|
static void retrieval_params_print_usage(int argc, char ** argv, gpt_params & gpt_params, retrieval_params & params) {
|
||||||
gpt_print_usage(argc, argv, gpt_params);
|
gpt_params_print_usage(argc, argv, gpt_params);
|
||||||
printf("retrieval options:\n");
|
printf("retrieval options:\n");
|
||||||
printf(" --context-file FNAME file containing context to embed.\n");
|
printf(" --context-file FNAME file containing context to embed.\n");
|
||||||
printf(" specify multiple files by providing --context-file option multiple times.\n");
|
printf(" specify multiple files by providing --context-file option multiple times.\n");
|
||||||
|
@ -226,7 +226,7 @@ int main(int argc, char ** argv) {
|
||||||
// print system information
|
// print system information
|
||||||
{
|
{
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
fprintf(stderr, "%s\n", get_system_info(params).c_str());
|
fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
// max batch size
|
// max batch size
|
||||||
|
|
52
examples/server/public_simplechat/index.html
Normal file
52
examples/server/public_simplechat/index.html
Normal file
|
@ -0,0 +1,52 @@
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<title>SimpleChat (LlamaCPP, ...) </title>
|
||||||
|
<meta charset="UTF-8" />
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
||||||
|
<meta name="message" content="Save Nature Save Earth" />
|
||||||
|
<meta name="description" content="SimpleChat: trigger LLM web service endpoints /chat/completions and /completions, single/multi chat sessions" />
|
||||||
|
<meta name="author" content="by Humans for All" />
|
||||||
|
<meta http-equiv="Cache-Control" content="no-cache, no-store, must-revalidate" />
|
||||||
|
<script src="simplechat.js" defer></script>
|
||||||
|
<link rel="stylesheet" href="simplechat.css" />
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<div class="samecolumn" id="fullbody">
|
||||||
|
|
||||||
|
<div class="sameline">
|
||||||
|
<p class="heading flex-grow" > <b> SimpleChat </b> </p>
|
||||||
|
<div class="sameline">
|
||||||
|
<label for="api-ep">Mode:</label>
|
||||||
|
<select name="api-ep" id="api-ep">
|
||||||
|
<option value="chat" selected>Chat</option>
|
||||||
|
<option value="completion">Completion</option>
|
||||||
|
</select>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div id="sessions-div" class="sameline"></div>
|
||||||
|
|
||||||
|
<hr>
|
||||||
|
<div class="sameline">
|
||||||
|
<label for="system-in">System</label>
|
||||||
|
<input type="text" name="system" id="system-in" class="flex-grow"/>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<hr>
|
||||||
|
<div id="chat-div">
|
||||||
|
<p> Enter the system prompt above, before entering/submitting any user query.</p>
|
||||||
|
<p> Enter your text to the ai assistant below.</p>
|
||||||
|
<p> Use shift+enter for inserting enter.</p>
|
||||||
|
<p> Refresh the page to start over fresh.</p>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<hr>
|
||||||
|
<div class="sameline">
|
||||||
|
<textarea id="user-in" class="flex-grow" rows="3"></textarea>
|
||||||
|
<button id="user-btn">submit</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</body>
|
||||||
|
</html>
|
81
examples/server/public_simplechat/readme.md
Normal file
81
examples/server/public_simplechat/readme.md
Normal file
|
@ -0,0 +1,81 @@
|
||||||
|
|
||||||
|
# SimpleChat
|
||||||
|
|
||||||
|
by Humans for All.
|
||||||
|
|
||||||
|
|
||||||
|
## overview
|
||||||
|
|
||||||
|
This simple web frontend allows triggering/testing the server's /completions or /chat/completions endpoints
|
||||||
|
in a simple way with minimal code from a common code base. In turn, it additionally tries to allow single or
|
||||||
|
multiple independent back and forth chatting to an extent, with the ai llm model at a basic level, with their
|
||||||
|
own system prompts.
|
||||||
|
|
||||||
|
The UI follows a responsive web design so that the layout can adapt to available display space in a usable
|
||||||
|
enough manner, in general.
|
||||||
|
|
||||||
|
NOTE: Given that the idea is for basic minimal testing, it doesn't bother with any model context length and
|
||||||
|
culling of old messages from the chat.
|
||||||
|
|
||||||
|
NOTE: It doesn't set any parameters other than temperature for now. However, if someone wants, they can update
|
||||||
|
the js file as needed.
|
||||||
|
|
||||||
|
|
||||||
|
## usage
|
||||||
|
|
||||||
|
One could run this web frontend directly using server itself or if anyone is thinking of adding a built in web
|
||||||
|
frontend to configure the server over http(s) or so, then run this web frontend using something like python's
|
||||||
|
http module.
|
||||||
|
|
||||||
|
### running using examples/server
|
||||||
|
|
||||||
|
bin/server -m path/model.gguf --path ../examples/server/public_simplechat [--port PORT]
|
||||||
|
|
||||||
|
### running using python3's server module
|
||||||
|
|
||||||
|
first run examples/server
|
||||||
|
* bin/server -m path/model.gguf
|
||||||
|
|
||||||
|
next run this web front end in examples/server/public_simplechat
|
||||||
|
* cd ../examples/server/public_simplechat
|
||||||
|
* python3 -m http.server PORT
|
||||||
|
|
||||||
|
### using the front end
|
||||||
|
|
||||||
|
Open this simple web front end from your local browser
|
||||||
|
* http://127.0.0.1:PORT/index.html
|
||||||
|
|
||||||
|
Once inside
|
||||||
|
* Select between chat and completion mode. By default it is set to chat mode.
|
||||||
|
* If you want to provide a system prompt, then ideally enter it first, before entering any user query.
|
||||||
|
* if chat.add_system_begin is used
|
||||||
|
* you can't change the system prompt after it has been submitted once along with a user query.
|
||||||
|
* you can't set a system prompt after you have submitted any user query.
|
||||||
|
* if chat.add_system_anytime is used
|
||||||
|
* one can change the system prompt any time during chat, by changing the contents of system prompt.
|
||||||
|
* in turn, the updated/changed system prompt will be inserted into the chat session.
|
||||||
|
* this allows for the subsequent user chatting to be driven by the new system prompt set above.
|
||||||
|
* Enter your query and either press enter or click on the submit button.
|
||||||
|
If you want to insert enter (\n) as part of your chat/query to ai model, use shift+enter.
|
||||||
|
* Wait for the logic to communicate with the server and get the response.
|
||||||
|
* the user is not allowed to enter any fresh query during this time.
|
||||||
|
* the user input box will be disabled and a working message will be shown in it.
|
||||||
|
* just refresh the page, to reset wrt the chat history and or system prompt and start afresh.
|
||||||
|
* Using NewChat one can start independent chat sessions.
|
||||||
|
* two independent chat sessions are setup by default.
|
||||||
|
|
||||||
|
|
||||||
|
## Devel note
|
||||||
|
|
||||||
|
Sometimes the browser may be stubborn with caching of the file, so your updates to html/css/js
|
||||||
|
may not be visible. Also remember that just refreshing/reloading page in browser or for that
|
||||||
|
matter clearing site data, doesn't directly override site caching in all cases. In the worst case, you may
|
||||||
|
have to change port. Or in dev tools of browser, you may be able to disable caching fully.
|
||||||
|
|
||||||
|
Concept of multiple chat sessions with different servers, as well as saving and restoring of
|
||||||
|
those across browser usage sessions, can be woven around the SimpleChat/MultiChatUI class and
|
||||||
|
its instances relatively easily, however given the current goal of keeping this simple, it has
|
||||||
|
not been added, for now.
|
||||||
|
|
||||||
|
By switching between chat.add_system_begin/anytime, one can control whether one can change
|
||||||
|
the system prompt, anytime during the conversation or only at the beginning.
|
61
examples/server/public_simplechat/simplechat.css
Normal file
61
examples/server/public_simplechat/simplechat.css
Normal file
|
@ -0,0 +1,61 @@
|
||||||
|
/**
|
||||||
|
* the styling of the simplechat web frontend
|
||||||
|
* by Humans for All
|
||||||
|
*/
|
||||||
|
|
||||||
|
#fullbody {
|
||||||
|
height: 98vh;
|
||||||
|
}
|
||||||
|
|
||||||
|
.heading {
|
||||||
|
background-color: lightgray;
|
||||||
|
}
|
||||||
|
|
||||||
|
.session-selected {
|
||||||
|
background-color: lightblue;
|
||||||
|
}
|
||||||
|
|
||||||
|
.role-system {
|
||||||
|
background-color: lightblue;
|
||||||
|
}
|
||||||
|
.role-user {
|
||||||
|
background-color: lightgray;
|
||||||
|
}
|
||||||
|
|
||||||
|
.flex-grow {
|
||||||
|
flex-grow: 1;
|
||||||
|
}
|
||||||
|
.float-right {
|
||||||
|
float: right;
|
||||||
|
}
|
||||||
|
|
||||||
|
#chat-div {
|
||||||
|
overflow: scroll;
|
||||||
|
flex-grow: 1;
|
||||||
|
flex-shrink: 1;
|
||||||
|
min-height: 40vh;
|
||||||
|
}
|
||||||
|
button {
|
||||||
|
min-width: 8vw;
|
||||||
|
}
|
||||||
|
|
||||||
|
.sameline {
|
||||||
|
display: flex;
|
||||||
|
flex-direction: row;
|
||||||
|
}
|
||||||
|
.samecolumn {
|
||||||
|
display: flex;
|
||||||
|
flex-direction: column;
|
||||||
|
}
|
||||||
|
|
||||||
|
* {
|
||||||
|
margin: 0.6vmin;
|
||||||
|
}
|
||||||
|
|
||||||
|
@media print {
|
||||||
|
|
||||||
|
#fullbody {
|
||||||
|
height: auto;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
478
examples/server/public_simplechat/simplechat.js
Normal file
478
examples/server/public_simplechat/simplechat.js
Normal file
|
@ -0,0 +1,478 @@
|
||||||
|
// @ts-check
|
||||||
|
// A simple completions and chat/completions test related web front end logic
|
||||||
|
// by Humans for All
|
||||||
|
|
||||||
|
/**
 * Names placed in the `role` field of each chat message.
 */
class Roles {
    static System = "system";
    static User = "user";
    static Assistant = "assistant";
}
|
||||||
|
|
||||||
|
/**
 * Identifiers of the supported endpoint modes.
 * The values match the option values of the `api-ep` select in index.html.
 */
class ApiEP {
    static Chat = "chat";
    static Completion = "completion";
}
|
||||||
|
|
||||||
|
// Usage help, as an html fragment; SimpleChat.show() places it in the chat div
// when a session has no messages to display.
let gUsageMsg = `
|
||||||
|
<p> Enter the system prompt above, before entering/submitting any user query.</p>
|
||||||
|
<p> Enter your text to the ai assistant below.</p>
|
||||||
|
<p> Use shift+enter for inserting enter.</p>
|
||||||
|
<p> Refresh the page to start over fresh.</p>
|
||||||
|
`;
|
||||||
|
|
||||||
|
/**
 * One chat session: an ordered list of {role, content} messages plus the
 * position of the most recently added system prompt.
 */
class SimpleChat {

    constructor() {
        /**
         * Maintain in a form suitable for common LLM web service chat/completions' messages entry
         * @type {{role: string, content: string}[]}
         */
        this.xchat = [];
        /** Index into xchat of the last added system message, or -1 if there is none. */
        this.iLastSys = -1;
    }

    /**
     * Remove all messages and forget the last system prompt position.
     * Prefer this over truncating xchat directly, so that iLastSys cannot be left stale.
     */
    clear() {
        this.xchat.length = 0;
        this.iLastSys = -1;
    }

    /**
     * Return the message iLastSys refers to, or undefined if there is none.
     * The index is re-validated on every call, because code outside this class
     * truncates xchat directly; a stale index must not be dereferenced.
     * @returns {{role: string, content: string} | undefined}
     */
    latest_system_entry() {
        if ((this.iLastSys < 0) || (this.iLastSys >= this.xchat.length)) {
            return undefined;
        }
        const entry = this.xchat[this.iLastSys];
        return (entry.role === Roles.System) ? entry : undefined;
    }

    /**
     * Add an entry into xchat
     * @param {string} role
     * @param {string|undefined|null} content
     * @returns {boolean} true if the entry was added, false if content was empty/missing.
     */
    add(role, content) {
        // content == null also matches undefined
        if ((content == null) || (content == "")) {
            return false;
        }
        this.xchat.push( {role: role, content: content} );
        if (role == Roles.System) {
            this.iLastSys = this.xchat.length - 1;
        }
        return true;
    }

    /**
     * Show the contents in the specified div.
     * If there are no messages and bClear is set, the usage message is shown instead.
     * @param {HTMLDivElement} div
     * @param {boolean} bClear
     */
    show(div, bClear=true) {
        if (bClear) {
            div.replaceChildren();
        }
        let last = undefined;
        for(const x of this.xchat) {
            let entry = document.createElement("p");
            entry.className = `role-${x.role}`;
            entry.innerText = `${x.role}: ${x.content}`;
            div.appendChild(entry);
            last = entry;
        }
        if (last !== undefined) {
            // keep the most recent message in view
            last.scrollIntoView(false);
        } else if (bClear) {
            div.innerHTML = gUsageMsg;
        }
    }

    /**
     * Add needed fields wrt json object to be sent wrt LLM web services completions endpoint
     * Convert the json into string.
     * @param {Object} obj
     */
    request_jsonstr(obj) {
        obj["temperature"] = 0.7;
        return JSON.stringify(obj);
    }

    /**
     * Return a string form of json object suitable for chat/completions
     */
    request_messages_jsonstr() {
        let req = {
            messages: this.xchat,
        }
        return this.request_jsonstr(req);
    }

    /**
     * Return a string form of json object suitable for /completions
     * The prompt is every message rendered as "role: content", one per line.
     */
    request_prompt_jsonstr() {
        let prompt = "";
        for(const chat of this.xchat) {
            prompt += `${chat.role}: ${chat.content}\n`;
        }
        let req = {
            prompt: prompt,
        }
        return this.request_jsonstr(req);
    }

    /**
     * Allow setting of system prompt, but only at beginning.
     * @param {string} sysPrompt
     * @param {string} msgTag
     * @returns {boolean} true only if the system prompt was added.
     */
    add_system_begin(sysPrompt, msgTag) {
        if (sysPrompt.length <= 0) {
            return false;
        }
        if (this.xchat.length == 0) {
            return this.add(Roles.System, sysPrompt);
        }
        if (this.xchat[0].role !== Roles.System) {
            console.error(`ERRR:SimpleChat:SC:${msgTag}:You need to specify system prompt before any user query, ignoring...`);
        } else if (this.xchat[0].content !== sysPrompt) {
            console.error(`ERRR:SimpleChat:SC:${msgTag}:You cant change system prompt, mid way through, ignoring...`);
        }
        return false;
    }

    /**
     * Allow setting of system prompt, at any time.
     * A new system message is appended only if sysPrompt is non-empty and
     * differs from the latest system prompt (or there is none yet).
     * @param {string} sysPrompt
     * @param {string} msgTag
     * @returns {boolean} true only if a system message was added.
     */
    add_system_anytime(sysPrompt, msgTag) {
        if (sysPrompt.length <= 0) {
            return false;
        }
        const lastSys = this.latest_system_entry();
        if ((lastSys === undefined) || (lastSys.content !== sysPrompt)) {
            return this.add(Roles.System, sysPrompt);
        }
        return false;
    }

    /**
     * Retrieve the latest system prompt.
     * @returns {string} the prompt text, or "" if no system prompt is present.
     */
    get_system_latest() {
        const lastSys = this.latest_system_entry();
        return (lastSys === undefined) ? "" : lastSys.content;
    }

}
|
||||||
|
|
||||||
|
|
||||||
|
// Address of the LLM web service the requests are sent to.
let gBaseURL = "http://127.0.0.1:8080";

// Endpoint url for each mode; the keys are the ApiEP values.
let gChatURL = {
    chat: gBaseURL + "/chat/completions",
    completion: gBaseURL + "/completions",
}

// If true, in completion mode the chat's messages are discarded after each response.
const gbCompletionFreshChatAlways = true;
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Assign a class to every child of elBase: the child whose id equals
 * idSelected gets classSelected, all the others get classUnSelected.
 * @param {HTMLDivElement} elBase
 * @param {string} idSelected
 * @param {string} classSelected
 * @param {string} classUnSelected
 */
function el_children_config_class(elBase, idSelected, classSelected, classUnSelected="") {
    Array.from(elBase.children).forEach((elChild) => {
        elChild.className = (elChild.id == idSelected) ? classSelected : classUnSelected;
    });
}
|
||||||
|
|
||||||
|
/**
 * Build a button element with a click handler attached.
 * The name and the visible text fall back to the id when not given.
 * The button is returned without being inserted into the document.
 * @param {string} id
 * @param {(this: HTMLButtonElement, ev: MouseEvent) => any} callback
 * @param {string | undefined} name
 * @param {string | undefined} innerText
 */
function el_create_button(id, callback, name=undefined, innerText=undefined) {
    const elBtn = document.createElement("button");
    elBtn.id = id;
    elBtn.name = name || id;
    elBtn.innerText = innerText || id;
    elBtn.addEventListener("click", callback);
    return elBtn;
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * UI controller: owns the SimpleChat sessions (keyed by chat id), tracks the
 * currently shown one, and wires the page's input elements to them.
 */
class MultiChatUI {

    /**
     * Look up the ui elements by id; throws if any of them is missing from the html.
     */
    constructor() {
        /** @type {Object<string, SimpleChat>} */
        this.simpleChats = {};
        /** @type {string} */
        this.curChatId = "";

        // the ui elements
        this.elInSystem = /** @type{HTMLInputElement} */(document.getElementById("system-in"));
        this.elDivChat = /** @type{HTMLDivElement} */(document.getElementById("chat-div"));
        this.elBtnUser = /** @type{HTMLButtonElement} */(document.getElementById("user-btn"));
        // user-in is a textarea in index.html
        this.elInUser = /** @type{HTMLTextAreaElement} */(document.getElementById("user-in"));
        this.elSelectApiEP = /** @type{HTMLSelectElement} */(document.getElementById("api-ep"));
        this.elDivSessions = /** @type{HTMLDivElement} */(document.getElementById("sessions-div"));

        // every element fetched above is checked, each against its own id
        this.validate_element(this.elInSystem, "system-in");
        this.validate_element(this.elDivChat, "chat-div");
        this.validate_element(this.elBtnUser, "user-btn");
        this.validate_element(this.elInUser, "user-in");
        this.validate_element(this.elSelectApiEP, "api-ep");
        this.validate_element(this.elDivSessions, "sessions-div");
    }

    /**
     * Check that the element was found; throw an Error naming msgTag if not.
     * @param {HTMLElement | null} el
     * @param {string} msgTag
     */
    validate_element(el, msgTag) {
        if (el == null) {
            throw Error(`ERRR:SimpleChat:MCUI:${msgTag} element missing in html...`);
        } else {
            console.debug(`INFO:SimpleChat:MCUI:${msgTag} Id[${el.id}] Name[${el["name"]}]`);
        }
    }

    /**
     * Reset user input ui.
     * * clear user input
     * * enable user input
     * * set focus to user input
     */
    ui_reset_userinput() {
        this.elInUser.value = "";
        this.elInUser.disabled = false;
        this.elInUser.focus();
    }

    /**
     * Setup the needed callbacks wrt UI, curChatId to defaultChatId and
     * optionally switch to specified defaultChatId.
     * @param {string} defaultChatId
     * @param {boolean} bSwitchSession
     */
    setup_ui(defaultChatId, bSwitchSession=false) {

        this.curChatId = defaultChatId;
        if (bSwitchSession) {
            this.handle_session_switch(this.curChatId);
        }

        this.elBtnUser.addEventListener("click", (ev)=>{
            // a disabled input box means a request is still pending
            if (this.elInUser.disabled) {
                return;
            }
            this.handle_user_submit(this.curChatId, this.elSelectApiEP.value).catch((/** @type{Error} */reason)=>{
                let msg = `ERRR:SimpleChat\nMCUI:HandleUserSubmit:${this.curChatId}\n${reason.name}:${reason.message}`;
                // log on a single line: replace every newline, not just the first one
                console.debug(msg.replace(/\n/g, ":"));
                alert(msg);
                this.ui_reset_userinput();
            });
        });

        this.elInUser.addEventListener("keyup", (ev)=> {
            // allow user to insert enter into their message using shift+enter.
            // while just pressing enter key will lead to submitting.
            if ((ev.key === "Enter") && (!ev.shiftKey)) {
                this.elBtnUser.click();
                ev.preventDefault();
            }
        });

        this.elInSystem.addEventListener("keyup", (ev)=> {
            // allow user to insert enter into the system prompt using shift+enter.
            // while just pressing enter key will lead to setting the system prompt.
            if ((ev.key === "Enter") && (!ev.shiftKey)) {
                let chat = this.simpleChats[this.curChatId];
                chat.add_system_anytime(this.elInSystem.value, this.curChatId);
                chat.show(this.elDivChat);
                ev.preventDefault();
            }
        });

    }

    /**
     * Setup a new chat session and optionally switch to it.
     * @param {string} chatId
     * @param {boolean} bSwitchSession
     */
    new_chat_session(chatId, bSwitchSession=false) {
        this.simpleChats[chatId] = new SimpleChat();
        if (bSwitchSession) {
            this.handle_session_switch(chatId);
        }
    }

    /**
     * Handle user query submit request, wrt specified chat session.
     * Adds the system prompt (if new) and the user input to the chat, posts the
     * request to the endpoint selected by apiEP, and adds the reply to the chat.
     * @param {string} chatId
     * @param {string} apiEP
     */
    async handle_user_submit(chatId, apiEP) {

        let chat = this.simpleChats[chatId];

        chat.add_system_anytime(this.elInSystem.value, chatId);

        let content = this.elInUser.value;
        if (!chat.add(Roles.User, content)) {
            console.debug(`WARN:SimpleChat:MCUI:${chatId}:HandleUserSubmit:Ignoring empty user input...`);
            return;
        }
        chat.show(this.elDivChat);

        let theBody;
        let theUrl = gChatURL[apiEP]
        if (apiEP == ApiEP.Chat) {
            theBody = chat.request_messages_jsonstr();
        } else {
            theBody = chat.request_prompt_jsonstr();
        }

        // the disabled input box doubles as the "request pending" marker
        this.elInUser.value = "working...";
        this.elInUser.disabled = true;
        console.debug(`DBUG:SimpleChat:MCUI:${chatId}:HandleUserSubmit:${theUrl}:ReqBody:${theBody}`);
        let resp = await fetch(theUrl, {
            method: "POST",
            headers: {
                "Content-Type": "application/json",
            },
            body: theBody,
        });

        let respBody = await resp.json();
        console.debug(`DBUG:SimpleChat:MCUI:${chatId}:HandleUserSubmit:RespBody:${JSON.stringify(respBody)}`);
        let assistantMsg;
        if (apiEP == ApiEP.Chat) {
            assistantMsg = respBody["choices"][0]["message"]["content"];
        } else {
            // fall back to a top level "content" field if there is no choices list
            try {
                assistantMsg = respBody["choices"][0]["text"];
            } catch {
                assistantMsg = respBody["content"];
            }
        }
        chat.add(Roles.Assistant, assistantMsg);
        if (chatId == this.curChatId) {
            chat.show(this.elDivChat);
        } else {
            console.debug(`DBUG:SimpleChat:MCUI:HandleUserSubmit:ChatId has changed:[${chatId}] [${this.curChatId}]`);
        }
        // Purposefully clear at end rather than begin of this function
        // so that one can switch from chat to completion mode and sequence
        // in a completion mode with multiple user-assistant chat data
        // from before to be sent/occur once.
        if ((apiEP == ApiEP.Completion) && (gbCompletionFreshChatAlways)) {
            chat.xchat.length = 0;
            // the system prompt index must be reset together with the messages,
            // else the next submit would dereference a message that is gone.
            chat.iLastSys = -1;
        }
        this.ui_reset_userinput();
    }

    /**
     * Show buttons for NewChat and available chat sessions, in the passed elDiv.
     * If elDiv is undefined/null, then use this.elDivSessions.
     * Take care of highlighting the selected chat-session's btn.
     * @param {HTMLDivElement | undefined} elDiv
     */
    show_sessions(elDiv=undefined) {
        if (!elDiv) {
            elDiv = this.elDivSessions;
        }
        elDiv.replaceChildren();
        // Btn for creating new chat session
        let btnNew = el_create_button("New CHAT", (ev)=> {
            if (this.elInUser.disabled) {
                console.error(`ERRR:SimpleChat:MCUI:NewChat:Current session [${this.curChatId}] awaiting response, ignoring request...`);
                alert("ERRR:SimpleChat\nMCUI:NewChat\nWait for response to pending query, before starting new chat session");
                return;
            }
            let chatId = `Chat${Object.keys(this.simpleChats).length}`;
            let chatIdGot = prompt("INFO:SimpleChat\nMCUI:NewChat\nEnter id for new chat session", chatId);
            if (!chatIdGot) {
                console.error("ERRR:SimpleChat:MCUI:NewChat:Skipping based on user request...");
                return;
            }
            // reusing an id would discard that session's messages and
            // create a second button with the same element id.
            if (Object.keys(this.simpleChats).includes(chatIdGot)) {
                console.error(`ERRR:SimpleChat:MCUI:NewChat:Chat session [${chatIdGot}] already exists, ignoring request...`);
                alert("ERRR:SimpleChat\nMCUI:NewChat\nA chat session with this id already exists");
                return;
            }
            this.new_chat_session(chatIdGot, true);
            this.create_session_btn(elDiv, chatIdGot);
            el_children_config_class(elDiv, chatIdGot, "session-selected", "");
        });
        elDiv.appendChild(btnNew);
        // Btns for existing chat sessions
        let chatIds = Object.keys(this.simpleChats);
        for(let cid of chatIds) {
            let btn = this.create_session_btn(elDiv, cid);
            if (cid == this.curChatId) {
                btn.className = "session-selected";
            }
        }
    }

    /**
     * Create the button for chat session cid and append it to elDiv.
     * Clicking it switches to that session, unless a response is still pending.
     * @param {HTMLDivElement} elDiv
     * @param {string} cid
     */
    create_session_btn(elDiv, cid) {
        let btn = el_create_button(cid, (ev)=>{
            let target = /** @type{HTMLButtonElement} */(ev.target);
            console.debug(`DBUG:SimpleChat:MCUI:SessionClick:${target.id}`);
            if (this.elInUser.disabled) {
                console.error(`ERRR:SimpleChat:MCUI:SessionClick:${target.id}:Current session [${this.curChatId}] awaiting response, ignoring switch...`);
                alert("ERRR:SimpleChat\nMCUI:SessionClick\nWait for response to pending query, before switching");
                return;
            }
            this.handle_session_switch(target.id);
            el_children_config_class(elDiv, target.id, "session-selected", "");
        });
        elDiv.appendChild(btn);
        return btn;
    }

    /**
     * Switch ui to the specified chatId and set curChatId to same.
     * Does nothing, other than logging an error, if there is no such chat session.
     * @param {string} chatId
     */
    async handle_session_switch(chatId) {
        let chat = this.simpleChats[chatId];
        if (chat == undefined) {
            console.error(`ERRR:SimpleChat:MCUI:HandleSessionSwitch:${chatId} missing...`);
            return;
        }
        this.elInSystem.value = chat.get_system_latest();
        this.elInUser.value = "";
        chat.show(this.elDivChat);
        this.elInUser.focus();
        this.curChatId = chatId;
        console.log(`INFO:SimpleChat:MCUI:HandleSessionSwitch:${chatId} entered...`);
    }

}
|
||||||
|
|
||||||
|
|
||||||
|
// The page's MultiChatUI instance; assigned by startme().
let gMuitChat;
// Ids of the chat sessions created at startup.
const gChatIds = [ "Default", "Other" ];

/**
 * Page entry point: create the ui controller, one chat session per entry of
 * gChatIds, hook up the ui callbacks with the first id as the current chat,
 * and show the session buttons.
 */
function startme() {
    console.log("INFO:SimpleChat:StartMe:Starting...");
    gMuitChat = new MultiChatUI();
    gChatIds.forEach((cid) => {
        gMuitChat.new_chat_session(cid);
    });
    gMuitChat.setup_ui(gChatIds[0]);
    gMuitChat.show_sessions();
}

// start only once the html elements exist
document.addEventListener("DOMContentLoaded", startme);
|
|
@ -1019,7 +1019,7 @@ struct server_context {
|
||||||
sampler_names.emplace_back(sampler_name);
|
sampler_names.emplace_back(sampler_name);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
slot.sparams.samplers_sequence = sampler_types_from_names(sampler_names, false);
|
slot.sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, false);
|
||||||
} else {
|
} else {
|
||||||
slot.sparams.samplers_sequence = default_sparams.samplers_sequence;
|
slot.sparams.samplers_sequence = default_sparams.samplers_sequence;
|
||||||
}
|
}
|
||||||
|
@ -1256,7 +1256,7 @@ struct server_context {
|
||||||
std::vector<std::string> samplers_sequence;
|
std::vector<std::string> samplers_sequence;
|
||||||
samplers_sequence.reserve(slot.sparams.samplers_sequence.size());
|
samplers_sequence.reserve(slot.sparams.samplers_sequence.size());
|
||||||
for (const auto & sampler_type : slot.sparams.samplers_sequence) {
|
for (const auto & sampler_type : slot.sparams.samplers_sequence) {
|
||||||
samplers_sequence.emplace_back(sampler_type_to_name_string(sampler_type));
|
samplers_sequence.emplace_back(llama_sampling_type_to_str(sampler_type));
|
||||||
}
|
}
|
||||||
|
|
||||||
return json {
|
return json {
|
||||||
|
@ -2852,7 +2852,7 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if (!parse_kv_override(argv[i], params.kv_overrides)) {
|
if (!string_parse_kv_override(argv[i], params.kv_overrides)) {
|
||||||
fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
|
fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
|
@ -3310,7 +3310,7 @@ int main(int argc, char ** argv) {
|
||||||
const auto handle_slots_save = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
|
const auto handle_slots_save = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
|
||||||
json request_data = json::parse(req.body);
|
json request_data = json::parse(req.body);
|
||||||
std::string filename = request_data.at("filename");
|
std::string filename = request_data.at("filename");
|
||||||
if (!validate_file_name(filename)) {
|
if (!fs_validate_filename(filename)) {
|
||||||
res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
|
res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -3340,7 +3340,7 @@ int main(int argc, char ** argv) {
|
||||||
const auto handle_slots_restore = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
|
const auto handle_slots_restore = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
|
||||||
json request_data = json::parse(req.body);
|
json request_data = json::parse(req.body);
|
||||||
std::string filename = request_data.at("filename");
|
std::string filename = request_data.at("filename");
|
||||||
if (!validate_file_name(filename)) {
|
if (!fs_validate_filename(filename)) {
|
||||||
res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
|
res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
|
@ -37,8 +37,8 @@ Feature: llama.cpp server
|
||||||
|
|
||||||
Examples: Prompts
|
Examples: Prompts
|
||||||
| prompt | n_predict | re_content | n_prompt | n_predicted | truncated |
|
| prompt | n_predict | re_content | n_prompt | n_predicted | truncated |
|
||||||
| I believe the meaning of life is | 8 | (read\|going\|pretty)+ | 18 | 8 | not |
|
| I believe the meaning of life is | 8 | (read\|going)+ | 18 | 8 | not |
|
||||||
| Write a joke about AI from a very long prompt which will not be truncated | 256 | (princesses\|everyone\|kids\|Anna\|forest)+ | 45 | 64 | not |
|
| Write a joke about AI from a very long prompt which will not be truncated | 256 | (princesses\|everyone\|kids\|Anna\|forest)+ | 46 | 64 | not |
|
||||||
|
|
||||||
Scenario: Completion prompt truncated
|
Scenario: Completion prompt truncated
|
||||||
Given a prompt:
|
Given a prompt:
|
||||||
|
@ -67,8 +67,8 @@ Feature: llama.cpp server
|
||||||
|
|
||||||
Examples: Prompts
|
Examples: Prompts
|
||||||
| model | system_prompt | user_prompt | max_tokens | re_content | n_prompt | n_predicted | enable_streaming | truncated |
|
| model | system_prompt | user_prompt | max_tokens | re_content | n_prompt | n_predicted | enable_streaming | truncated |
|
||||||
| llama-2 | Book | What is the best book | 8 | (Here\|what)+ | 76 | 8 | disabled | not |
|
| llama-2 | Book | What is the best book | 8 | (Here\|what)+ | 77 | 8 | disabled | not |
|
||||||
| codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 128 | (thanks\|happy\|bird\|fireplace)+ | -1 | 64 | enabled | |
|
| codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 128 | (thanks\|happy\|bird\|Annabyear)+ | -1 | 64 | enabled | |
|
||||||
|
|
||||||
|
|
||||||
Scenario Outline: OAI Compatibility w/ response format
|
Scenario Outline: OAI Compatibility w/ response format
|
||||||
|
@ -84,7 +84,7 @@ Feature: llama.cpp server
|
||||||
| response_format | n_predicted | re_content |
|
| response_format | n_predicted | re_content |
|
||||||
| {"type": "json_object", "schema": {"const": "42"}} | 5 | "42" |
|
| {"type": "json_object", "schema": {"const": "42"}} | 5 | "42" |
|
||||||
| {"type": "json_object", "schema": {"items": [{"type": "integer"}]}} | 10 | \[ -300 \] |
|
| {"type": "json_object", "schema": {"items": [{"type": "integer"}]}} | 10 | \[ -300 \] |
|
||||||
| {"type": "json_object"} | 10 | \{ " Saragine. |
|
| {"type": "json_object"} | 10 | \{ " Jacky. |
|
||||||
|
|
||||||
|
|
||||||
Scenario: Tokenize / Detokenize
|
Scenario: Tokenize / Detokenize
|
||||||
|
|
|
@ -26,7 +26,7 @@ Feature: llama.cpp server slot management
|
||||||
# Since we have cache, this should only process the last tokens
|
# Since we have cache, this should only process the last tokens
|
||||||
Given a user prompt "What is the capital of Germany?"
|
Given a user prompt "What is the capital of Germany?"
|
||||||
And a completion request with no api error
|
And a completion request with no api error
|
||||||
Then 24 tokens are predicted matching (Thank|special|Lily)
|
Then 24 tokens are predicted matching (Thank|special)
|
||||||
And 7 prompt tokens are processed
|
And 7 prompt tokens are processed
|
||||||
# Loading the original cache into slot 0,
|
# Loading the original cache into slot 0,
|
||||||
# we should only be processing 1 prompt token and get the same output
|
# we should only be processing 1 prompt token and get the same output
|
||||||
|
@ -41,7 +41,7 @@ Feature: llama.cpp server slot management
|
||||||
Given a user prompt "What is the capital of Germany?"
|
Given a user prompt "What is the capital of Germany?"
|
||||||
And using slot id 1
|
And using slot id 1
|
||||||
And a completion request with no api error
|
And a completion request with no api error
|
||||||
Then 24 tokens are predicted matching (Thank|special|Lily)
|
Then 24 tokens are predicted matching (Thank|special)
|
||||||
And 1 prompt tokens are processed
|
And 1 prompt tokens are processed
|
||||||
|
|
||||||
Scenario: Erase Slot
|
Scenario: Erase Slot
|
||||||
|
|
|
@ -13,10 +13,10 @@ if %errorlevel% neq 0 goto ERROR
|
||||||
|
|
||||||
:: for FP16
|
:: for FP16
|
||||||
:: faster for long-prompt inference
|
:: faster for long-prompt inference
|
||||||
:: cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
|
:: cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
|
||||||
|
|
||||||
:: for FP32
|
:: for FP32
|
||||||
cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release
|
cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
|
||||||
if %errorlevel% neq 0 goto ERROR
|
if %errorlevel% neq 0 goto ERROR
|
||||||
:: build example/main only
|
:: build example/main only
|
||||||
:: make main
|
:: make main
|
||||||
|
|
|
@ -301,8 +301,8 @@ static struct ggml_tensor * llama_build_train_graphs(
|
||||||
// not capturing these, to silcence warnings
|
// not capturing these, to silcence warnings
|
||||||
const int rope_mode = 0;
|
const int rope_mode = 0;
|
||||||
|
|
||||||
return ggml_rope_custom(
|
return ggml_rope_ext(
|
||||||
ctx, t, KQ_pos, n_rot, rope_mode, n_ctx, 0, rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
|
ctx, t, KQ_pos, nullptr, n_rot, rope_mode, n_ctx, 0, rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
|
||||||
);
|
);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -341,7 +341,8 @@ static struct ggml_tensor * llama_build_train_graphs(
|
||||||
struct ggml_tensor * t15 = ggml_permute (ctx, t12, 0, 3, 1, 2); set_name(t15, "t15"); assert_shape_4d(t15, N, n_embd/n_head, n_head, n_batch);
|
struct ggml_tensor * t15 = ggml_permute (ctx, t12, 0, 3, 1, 2); set_name(t15, "t15"); assert_shape_4d(t15, N, n_embd/n_head, n_head, n_batch);
|
||||||
struct ggml_tensor * t16;
|
struct ggml_tensor * t16;
|
||||||
if (enable_flash_attn) {
|
if (enable_flash_attn) {
|
||||||
t16 = ggml_flash_attn(ctx, t13, t14, t15, true); set_name(t16, "t16"); assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch);
|
GGML_ASSERT(false && "TODO: ggml_flash_attn_ext() not yet supported");
|
||||||
|
//t16 = ggml_flash_attn(ctx, t13, t14, t15, true); set_name(t16, "t16"); assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch);
|
||||||
} else {
|
} else {
|
||||||
struct ggml_tensor * t16_0 = ggml_mul_mat (ctx, t14, t13); set_name(t16_0, "t16_0"); assert_shape_4d(t16_0, N, N, n_head, n_batch);
|
struct ggml_tensor * t16_0 = ggml_mul_mat (ctx, t14, t13); set_name(t16_0, "t16_0"); assert_shape_4d(t16_0, N, N, n_head, n_batch);
|
||||||
struct ggml_tensor * t16_1 = ggml_scale_inplace (ctx, t16_0, kv_scale); set_name(t16_1, "t16_1"); assert_shape_4d(t16_1, N, N, n_head, n_batch);
|
struct ggml_tensor * t16_1 = ggml_scale_inplace (ctx, t16_0, kv_scale); set_name(t16_1, "t16_1"); assert_shape_4d(t16_1, N, N, n_head, n_batch);
|
||||||
|
|
|
@ -65,13 +65,8 @@ typedef sycl::half2 ggml_half2;
|
||||||
// QK = number of values after dequantization
|
// QK = number of values after dequantization
|
||||||
// QK_K = super-block size
|
// QK_K = super-block size
|
||||||
|
|
||||||
#ifdef GGML_QKK_64
|
|
||||||
#define QK_K 64
|
|
||||||
#define K_SCALE_SIZE 4
|
|
||||||
#else
|
|
||||||
#define QK_K 256
|
#define QK_K 256
|
||||||
#define K_SCALE_SIZE 12
|
#define K_SCALE_SIZE 12
|
||||||
#endif // GGML_QKK_64
|
|
||||||
|
|
||||||
#if defined(GGML_COMMON_DECL_CUDA) || defined(GGML_COMMON_DECL_HIP) || defined(GGML_COMMON_DECL_SYCL)
|
#if defined(GGML_COMMON_DECL_CUDA) || defined(GGML_COMMON_DECL_HIP) || defined(GGML_COMMON_DECL_SYCL)
|
||||||
// QR = QK / number of values before dequantization
|
// QR = QK / number of values before dequantization
|
||||||
|
@ -131,13 +126,8 @@ typedef sycl::half2 ggml_half2;
|
||||||
#define QI4_NL (QK4_NL / (4*QR4_NL))
|
#define QI4_NL (QK4_NL / (4*QR4_NL))
|
||||||
#define QR4_NL 2
|
#define QR4_NL 2
|
||||||
|
|
||||||
#if QK_K == 64
|
|
||||||
#define QI4_XS QI4_NL
|
|
||||||
#define QR4_XS QR4_NL
|
|
||||||
#else
|
|
||||||
#define QI4_XS (QK_K / (4*QR4_XS))
|
#define QI4_XS (QK_K / (4*QR4_XS))
|
||||||
#define QR4_XS 8
|
#define QR4_XS 8
|
||||||
#endif
|
|
||||||
|
|
||||||
#endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP
|
#endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP
|
||||||
|
|
||||||
|
@ -228,15 +218,6 @@ static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wro
|
||||||
// weight is represented as x = a * q
|
// weight is represented as x = a * q
|
||||||
// 16 blocks of 16 elements each
|
// 16 blocks of 16 elements each
|
||||||
// Effectively 3.4375 bits per weight
|
// Effectively 3.4375 bits per weight
|
||||||
#ifdef GGML_QKK_64
|
|
||||||
typedef struct {
|
|
||||||
uint8_t hmask[QK_K/8]; // quants - high bit
|
|
||||||
uint8_t qs[QK_K/4]; // quants - low 2 bits
|
|
||||||
uint8_t scales[2];
|
|
||||||
ggml_half d; // super-block scale
|
|
||||||
} block_q3_K;
|
|
||||||
static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding");
|
|
||||||
#else
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
uint8_t hmask[QK_K/8]; // quants - high bit
|
uint8_t hmask[QK_K/8]; // quants - high bit
|
||||||
uint8_t qs[QK_K/4]; // quants - low 2 bits
|
uint8_t qs[QK_K/4]; // quants - low 2 bits
|
||||||
|
@ -244,20 +225,11 @@ typedef struct {
|
||||||
ggml_half d; // super-block scale
|
ggml_half d; // super-block scale
|
||||||
} block_q3_K;
|
} block_q3_K;
|
||||||
static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
|
static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
|
||||||
#endif
|
|
||||||
|
|
||||||
// 4-bit quantization
|
// 4-bit quantization
|
||||||
// 8 blocks of 32 elements each
|
// 8 blocks of 32 elements each
|
||||||
// weight is represented as x = a * q + b
|
// weight is represented as x = a * q + b
|
||||||
// Effectively 4.5 bits per weight
|
// Effectively 4.5 bits per weight
|
||||||
#ifdef GGML_QKK_64
|
|
||||||
typedef struct {
|
|
||||||
ggml_half d[2]; // super-block scales/mins
|
|
||||||
uint8_t scales[2]; // 4-bit block scales/mins
|
|
||||||
uint8_t qs[QK_K/2]; // 4--bit quants
|
|
||||||
} block_q4_K;
|
|
||||||
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + QK_K/2 + 2, "wrong q4_K block size/padding");
|
|
||||||
#else
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
union {
|
union {
|
||||||
struct {
|
struct {
|
||||||
|
@ -270,21 +242,11 @@ typedef struct {
|
||||||
uint8_t qs[QK_K/2]; // 4--bit quants
|
uint8_t qs[QK_K/2]; // 4--bit quants
|
||||||
} block_q4_K;
|
} block_q4_K;
|
||||||
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
|
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
|
||||||
#endif
|
|
||||||
|
|
||||||
// 5-bit quantization
|
// 5-bit quantization
|
||||||
// 8 blocks of 32 elements each
|
// 8 blocks of 32 elements each
|
||||||
// weight is represented as x = a * q + b
|
// weight is represented as x = a * q + b
|
||||||
// Effectively 5.5 bits per weight
|
// Effectively 5.5 bits per weight
|
||||||
#ifdef GGML_QKK_64
|
|
||||||
typedef struct {
|
|
||||||
ggml_half d; // super-block scale
|
|
||||||
int8_t scales[QK_K/16]; // 8-bit block scales
|
|
||||||
uint8_t qh[QK_K/8]; // quants, high bit
|
|
||||||
uint8_t qs[QK_K/2]; // quants, low 4 bits
|
|
||||||
} block_q5_K;
|
|
||||||
static_assert(sizeof(block_q5_K) == sizeof(ggml_half) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
|
|
||||||
#else
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
union {
|
union {
|
||||||
struct {
|
struct {
|
||||||
|
@ -298,7 +260,6 @@ typedef struct {
|
||||||
uint8_t qs[QK_K/2]; // quants, low 4 bits
|
uint8_t qs[QK_K/2]; // quants, low 4 bits
|
||||||
} block_q5_K;
|
} block_q5_K;
|
||||||
static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
|
static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
|
||||||
#endif
|
|
||||||
|
|
||||||
// 6-bit quantization
|
// 6-bit quantization
|
||||||
// weight is represented as x = a * q
|
// weight is represented as x = a * q
|
||||||
|
@ -356,11 +317,7 @@ typedef struct {
|
||||||
static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_half) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
|
static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_half) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
|
||||||
|
|
||||||
// 3.4375 bpw
|
// 3.4375 bpw
|
||||||
#if QK_K == 64
|
|
||||||
#define IQ3S_N_SCALE 2
|
|
||||||
#else
|
|
||||||
#define IQ3S_N_SCALE QK_K/64
|
#define IQ3S_N_SCALE QK_K/64
|
||||||
#endif
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
ggml_half d;
|
ggml_half d;
|
||||||
uint8_t qs[QK_K/4];
|
uint8_t qs[QK_K/4];
|
||||||
|
@ -381,16 +338,9 @@ static_assert(sizeof(block_iq1_s) == sizeof(ggml_half) + QK_K/8 + QK_K/16, "wron
|
||||||
typedef struct {
|
typedef struct {
|
||||||
uint8_t qs[QK_K/8]; // grid index, low 8 bits
|
uint8_t qs[QK_K/8]; // grid index, low 8 bits
|
||||||
uint8_t qh[QK_K/16]; // grid index, high 3 bits + grid shift bit (for two groups of 8)
|
uint8_t qh[QK_K/16]; // grid index, high 3 bits + grid shift bit (for two groups of 8)
|
||||||
#if QK_K == 64
|
|
||||||
ggml_half d;
|
|
||||||
#endif
|
|
||||||
uint8_t scales[QK_K/32]; // 3-bit block scales (4-bit if QK_K == 64)
|
uint8_t scales[QK_K/32]; // 3-bit block scales (4-bit if QK_K == 64)
|
||||||
} block_iq1_m;
|
} block_iq1_m;
|
||||||
#if QK_K == 64
|
|
||||||
static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32 + sizeof(ggml_half), "wrong iq1_m block size/padding");
|
|
||||||
#else
|
|
||||||
static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32, "wrong iq1_m block size/padding");
|
static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32, "wrong iq1_m block size/padding");
|
||||||
#endif
|
|
||||||
|
|
||||||
// Used by IQ1_M quants
|
// Used by IQ1_M quants
|
||||||
typedef union {
|
typedef union {
|
||||||
|
@ -406,9 +356,6 @@ typedef struct {
|
||||||
} block_iq4_nl;
|
} block_iq4_nl;
|
||||||
static_assert(sizeof(block_iq4_nl) == sizeof(ggml_half) + QK4_NL/2, "wrong iq4_nl block size/padding");
|
static_assert(sizeof(block_iq4_nl) == sizeof(ggml_half) + QK4_NL/2, "wrong iq4_nl block size/padding");
|
||||||
|
|
||||||
#if QK_K == 64
|
|
||||||
#define block_iq4_xs block_iq4_nl
|
|
||||||
#else
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
ggml_half d;
|
ggml_half d;
|
||||||
uint16_t scales_h;
|
uint16_t scales_h;
|
||||||
|
@ -416,7 +363,6 @@ typedef struct {
|
||||||
uint8_t qs[QK_K/2];
|
uint8_t qs[QK_K/2];
|
||||||
} block_iq4_xs;
|
} block_iq4_xs;
|
||||||
static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
|
static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
|
||||||
#endif
|
|
||||||
|
|
||||||
#endif // GGML_COMMON_DECL
|
#endif // GGML_COMMON_DECL
|
||||||
#endif // GGML_COMMON_DECL
|
#endif // GGML_COMMON_DECL
|
||||||
|
|
|
@ -131,7 +131,6 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t
|
||||||
const block_q2_K * x = (const block_q2_K *) vx;
|
const block_q2_K * x = (const block_q2_K *) vx;
|
||||||
|
|
||||||
const int64_t tid = threadIdx.x;
|
const int64_t tid = threadIdx.x;
|
||||||
#if QK_K == 256
|
|
||||||
const int64_t n = tid/32;
|
const int64_t n = tid/32;
|
||||||
const int64_t l = tid - 32*n;
|
const int64_t l = tid - 32*n;
|
||||||
const int64_t is = 8*n + l/16;
|
const int64_t is = 8*n + l/16;
|
||||||
|
@ -145,17 +144,6 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t
|
||||||
y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
|
y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
|
||||||
y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
|
y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
|
||||||
y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
|
y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
|
||||||
#else
|
|
||||||
const int64_t is = tid/16; // 0 or 1
|
|
||||||
const int64_t il = tid%16; // 0...15
|
|
||||||
const uint8_t q = x[i].qs[il] >> (2*is);
|
|
||||||
dst_t * y = yy + i*QK_K + 16*is + il;
|
|
||||||
float dall = __low2half(x[i].dm);
|
|
||||||
float dmin = __high2half(x[i].dm);
|
|
||||||
y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
|
|
||||||
y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename dst_t>
|
template<typename dst_t>
|
||||||
|
@ -164,7 +152,6 @@ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t
|
||||||
const int64_t i = blockIdx.x;
|
const int64_t i = blockIdx.x;
|
||||||
const block_q3_K * x = (const block_q3_K *) vx;
|
const block_q3_K * x = (const block_q3_K *) vx;
|
||||||
|
|
||||||
#if QK_K == 256
|
|
||||||
const int64_t r = threadIdx.x/4;
|
const int64_t r = threadIdx.x/4;
|
||||||
const int64_t tid = r/2;
|
const int64_t tid = r/2;
|
||||||
const int64_t is0 = r%2;
|
const int64_t is0 = r%2;
|
||||||
|
@ -188,31 +175,8 @@ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t
|
||||||
const uint8_t * hm = x[i].hmask;
|
const uint8_t * hm = x[i].hmask;
|
||||||
|
|
||||||
for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
|
for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
|
||||||
#else
|
|
||||||
const int64_t tid = threadIdx.x;
|
|
||||||
const int64_t is = tid/16; // 0 or 1
|
|
||||||
const int64_t il = tid%16; // 0...15
|
|
||||||
const int64_t im = il/8; // 0...1
|
|
||||||
const int64_t in = il%8; // 0...7
|
|
||||||
|
|
||||||
dst_t * y = yy + i*QK_K + 16*is + il;
|
|
||||||
|
|
||||||
const uint8_t q = x[i].qs[il] >> (2*is);
|
|
||||||
const uint8_t h = x[i].hmask[in] >> (2*is + im);
|
|
||||||
const float d = (float)x[i].d;
|
|
||||||
|
|
||||||
if (is == 0) {
|
|
||||||
y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
|
|
||||||
y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
|
|
||||||
} else {
|
|
||||||
y[ 0] = d * ((x[i].scales[0] >> 4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
|
|
||||||
y[32] = d * ((x[i].scales[1] >> 4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#if QK_K == 256
|
|
||||||
static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
|
static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
|
||||||
if (j < 4) {
|
if (j < 4) {
|
||||||
d = q[j] & 63; m = q[j + 4] & 63;
|
d = q[j] & 63; m = q[j + 4] & 63;
|
||||||
|
@ -221,7 +185,6 @@ static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t
|
||||||
m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4);
|
m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
|
|
||||||
template<typename dst_t>
|
template<typename dst_t>
|
||||||
static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||||
|
@ -229,7 +192,6 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t
|
||||||
|
|
||||||
const int64_t i = blockIdx.x;
|
const int64_t i = blockIdx.x;
|
||||||
|
|
||||||
#if QK_K == 256
|
|
||||||
// assume 32 threads
|
// assume 32 threads
|
||||||
const int64_t tid = threadIdx.x;
|
const int64_t tid = threadIdx.x;
|
||||||
const int64_t il = tid/8;
|
const int64_t il = tid/8;
|
||||||
|
@ -253,15 +215,6 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t
|
||||||
y[l + 0] = d1 * (q[l] & 0xF) - m1;
|
y[l + 0] = d1 * (q[l] & 0xF) - m1;
|
||||||
y[l +32] = d2 * (q[l] >> 4) - m2;
|
y[l +32] = d2 * (q[l] >> 4) - m2;
|
||||||
}
|
}
|
||||||
#else
|
|
||||||
const int64_t tid = threadIdx.x;
|
|
||||||
const uint8_t * q = x[i].qs;
|
|
||||||
dst_t * y = yy + i*QK_K;
|
|
||||||
const float d = (float)x[i].dm[0];
|
|
||||||
const float m = (float)x[i].dm[1];
|
|
||||||
y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
|
|
||||||
y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >> 4) - m * (x[i].scales[1] >> 4);
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename dst_t>
|
template<typename dst_t>
|
||||||
|
@ -270,7 +223,6 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t
|
||||||
|
|
||||||
const int64_t i = blockIdx.x;
|
const int64_t i = blockIdx.x;
|
||||||
|
|
||||||
#if QK_K == 256
|
|
||||||
// assume 64 threads - this is very slightly better than the one below
|
// assume 64 threads - this is very slightly better than the one below
|
||||||
const int64_t tid = threadIdx.x;
|
const int64_t tid = threadIdx.x;
|
||||||
const int64_t il = tid/16; // il is in 0...3
|
const int64_t il = tid/16; // il is in 0...3
|
||||||
|
@ -297,18 +249,6 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t
|
||||||
hm <<= 1;
|
hm <<= 1;
|
||||||
y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2;
|
y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2;
|
||||||
y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
|
y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
|
||||||
#else
|
|
||||||
const int64_t tid = threadIdx.x;
|
|
||||||
const uint8_t q = x[i].qs[tid];
|
|
||||||
const int64_t im = tid/8; // 0...3
|
|
||||||
const int64_t in = tid%8; // 0...7
|
|
||||||
const int64_t is = tid/16; // 0 or 1
|
|
||||||
const uint8_t h = x[i].qh[in] >> im;
|
|
||||||
const float d = x[i].d;
|
|
||||||
dst_t * y = yy + i*QK_K + tid;
|
|
||||||
y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16));
|
|
||||||
y[32] = d * x[i].scales[is+2] * ((q >> 4) - ((h >> 4) & 1 ? 0 : 16));
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename dst_t>
|
template<typename dst_t>
|
||||||
|
@ -316,7 +256,6 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t
|
||||||
const block_q6_K * x = (const block_q6_K *) vx;
|
const block_q6_K * x = (const block_q6_K *) vx;
|
||||||
|
|
||||||
const int64_t i = blockIdx.x;
|
const int64_t i = blockIdx.x;
|
||||||
#if QK_K == 256
|
|
||||||
|
|
||||||
// assume 64 threads - this is very slightly better than the one below
|
// assume 64 threads - this is very slightly better than the one below
|
||||||
const int64_t tid = threadIdx.x;
|
const int64_t tid = threadIdx.x;
|
||||||
|
@ -336,24 +275,6 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t
|
||||||
y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
|
y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
|
||||||
y[64] = d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32);
|
y[64] = d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32);
|
||||||
y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
|
y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
|
||||||
#else
|
|
||||||
|
|
||||||
// assume 32 threads
|
|
||||||
const int64_t tid = threadIdx.x;
|
|
||||||
const int64_t ip = tid/16; // 0 or 1
|
|
||||||
const int64_t il = tid - 16*ip; // 0...15
|
|
||||||
|
|
||||||
dst_t * y = yy + i*QK_K + 16*ip + il;
|
|
||||||
|
|
||||||
const float d = x[i].d;
|
|
||||||
|
|
||||||
const uint8_t ql = x[i].ql[16*ip + il];
|
|
||||||
const uint8_t qh = x[i].qh[il] >> (2*ip);
|
|
||||||
const int8_t * sc = x[i].scales;
|
|
||||||
|
|
||||||
y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
|
|
||||||
y[32] = d * sc[ip+2] * ((int8_t)((ql >> 4) | (((qh >> 4) & 3) << 4)) - 32);
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename dst_t>
|
template<typename dst_t>
|
||||||
|
@ -363,7 +284,6 @@ static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, ds
|
||||||
const block_iq2_xxs * x = (const block_iq2_xxs *) vx;
|
const block_iq2_xxs * x = (const block_iq2_xxs *) vx;
|
||||||
|
|
||||||
const int64_t tid = threadIdx.x;
|
const int64_t tid = threadIdx.x;
|
||||||
#if QK_K == 256
|
|
||||||
const int64_t il = tid/8; // 0...3
|
const int64_t il = tid/8; // 0...3
|
||||||
const int64_t ib = tid%8; // 0...7
|
const int64_t ib = tid%8; // 0...7
|
||||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||||
|
@ -374,10 +294,6 @@ static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, ds
|
||||||
const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.25f;
|
const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.25f;
|
||||||
const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127];
|
const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127];
|
||||||
for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
|
for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
|
||||||
#else
|
|
||||||
NO_DEVICE_CODE;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename dst_t>
|
template<typename dst_t>
|
||||||
|
@ -387,7 +303,6 @@ static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst
|
||||||
const block_iq2_xs * x = (const block_iq2_xs *) vx;
|
const block_iq2_xs * x = (const block_iq2_xs *) vx;
|
||||||
|
|
||||||
const int64_t tid = threadIdx.x;
|
const int64_t tid = threadIdx.x;
|
||||||
#if QK_K == 256
|
|
||||||
const int64_t il = tid/8; // 0...3
|
const int64_t il = tid/8; // 0...3
|
||||||
const int64_t ib = tid%8; // 0...7
|
const int64_t ib = tid%8; // 0...7
|
||||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||||
|
@ -396,10 +311,6 @@ static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst
|
||||||
const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
|
const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
|
||||||
const uint8_t signs = ksigns_iq2xs[q2[il] >> 9];
|
const uint8_t signs = ksigns_iq2xs[q2[il] >> 9];
|
||||||
for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
|
for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
|
||||||
#else
|
|
||||||
NO_DEVICE_CODE;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename dst_t>
|
template<typename dst_t>
|
||||||
|
@ -409,7 +320,6 @@ static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_
|
||||||
const block_iq2_s * x = (const block_iq2_s *) vx;
|
const block_iq2_s * x = (const block_iq2_s *) vx;
|
||||||
|
|
||||||
const int64_t tid = threadIdx.x;
|
const int64_t tid = threadIdx.x;
|
||||||
#if QK_K == 256
|
|
||||||
const int64_t il = tid/8; // 0...3
|
const int64_t il = tid/8; // 0...3
|
||||||
const int64_t ib = tid%8; // 0...7
|
const int64_t ib = tid%8; // 0...7
|
||||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||||
|
@ -417,10 +327,6 @@ static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_
|
||||||
const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
|
const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
|
||||||
const uint8_t signs = x[i].qs[QK_K/8+4*ib+il];
|
const uint8_t signs = x[i].qs[QK_K/8+4*ib+il];
|
||||||
for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
|
for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
|
||||||
#else
|
|
||||||
NO_DEVICE_CODE;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename dst_t>
|
template<typename dst_t>
|
||||||
|
@ -430,7 +336,6 @@ static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, ds
|
||||||
const block_iq3_xxs * x = (const block_iq3_xxs *) vx;
|
const block_iq3_xxs * x = (const block_iq3_xxs *) vx;
|
||||||
|
|
||||||
const int64_t tid = threadIdx.x;
|
const int64_t tid = threadIdx.x;
|
||||||
#if QK_K == 256
|
|
||||||
const int64_t il = tid/8; // 0...3
|
const int64_t il = tid/8; // 0...3
|
||||||
const int64_t ib = tid%8; // 0...7
|
const int64_t ib = tid%8; // 0...7
|
||||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||||
|
@ -445,10 +350,6 @@ static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, ds
|
||||||
y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
||||||
y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
||||||
}
|
}
|
||||||
#else
|
|
||||||
NO_DEVICE_CODE;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename dst_t>
|
template<typename dst_t>
|
||||||
|
@ -458,7 +359,6 @@ static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_
|
||||||
const block_iq3_s * x = (const block_iq3_s *) vx;
|
const block_iq3_s * x = (const block_iq3_s *) vx;
|
||||||
|
|
||||||
const int64_t tid = threadIdx.x;
|
const int64_t tid = threadIdx.x;
|
||||||
#if QK_K == 256
|
|
||||||
const int64_t il = tid/8; // 0...3
|
const int64_t il = tid/8; // 0...3
|
||||||
const int64_t ib = tid%8; // 0...7
|
const int64_t ib = tid%8; // 0...7
|
||||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||||
|
@ -471,10 +371,6 @@ static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_
|
||||||
y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
||||||
y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
||||||
}
|
}
|
||||||
#else
|
|
||||||
NO_DEVICE_CODE;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename dst_t>
|
template<typename dst_t>
|
||||||
|
@ -484,7 +380,6 @@ static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_
|
||||||
const block_iq1_s * x = (const block_iq1_s *) vx;
|
const block_iq1_s * x = (const block_iq1_s *) vx;
|
||||||
|
|
||||||
const int64_t tid = threadIdx.x;
|
const int64_t tid = threadIdx.x;
|
||||||
#if QK_K == 256
|
|
||||||
const int64_t il = tid/8; // 0...3
|
const int64_t il = tid/8; // 0...3
|
||||||
const int64_t ib = tid%8; // 0...7
|
const int64_t ib = tid%8; // 0...7
|
||||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||||
|
@ -497,10 +392,6 @@ static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_
|
||||||
for (int j = 0; j < 8; ++j) {
|
for (int j = 0; j < 8; ++j) {
|
||||||
y[j] = d * (q[j] + delta);
|
y[j] = d * (q[j] + delta);
|
||||||
}
|
}
|
||||||
#else
|
|
||||||
NO_DEVICE_CODE;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename dst_t>
|
template<typename dst_t>
|
||||||
|
@ -510,7 +401,6 @@ static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_
|
||||||
const block_iq1_m * x = (const block_iq1_m *) vx;
|
const block_iq1_m * x = (const block_iq1_m *) vx;
|
||||||
|
|
||||||
const int64_t tid = threadIdx.x;
|
const int64_t tid = threadIdx.x;
|
||||||
#if QK_K == 256
|
|
||||||
const int64_t il = tid/8; // 0...3
|
const int64_t il = tid/8; // 0...3
|
||||||
const int64_t ib = tid%8; // 0...7
|
const int64_t ib = tid%8; // 0...7
|
||||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||||
|
@ -527,13 +417,8 @@ static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_
|
||||||
for (int j = 0; j < 8; ++j) {
|
for (int j = 0; j < 8; ++j) {
|
||||||
y[j] = d * (q[j] + delta);
|
y[j] = d * (q[j] + delta);
|
||||||
}
|
}
|
||||||
#else
|
|
||||||
NO_DEVICE_CODE;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
template<typename dst_t>
|
template<typename dst_t>
|
||||||
static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||||
|
|
||||||
|
@ -550,10 +435,8 @@ static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst
|
||||||
y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
|
y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
|
||||||
y[j+16] = d * kvalues_iq4nl[q4[j] >> 4];
|
y[j+16] = d * kvalues_iq4nl[q4[j] >> 4];
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#if QK_K != 64
|
|
||||||
template<typename dst_t>
|
template<typename dst_t>
|
||||||
static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||||
const int64_t i = blockIdx.x;
|
const int64_t i = blockIdx.x;
|
||||||
|
@ -570,7 +453,6 @@ static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst
|
||||||
y[j+16] = d * kvalues_iq4nl[q4[j] >> 4];
|
y[j+16] = d * kvalues_iq4nl[q4[j] >> 4];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
|
|
||||||
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
|
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
|
||||||
static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, cudaStream_t stream) {
|
static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, cudaStream_t stream) {
|
||||||
|
@ -592,21 +474,13 @@ static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half *
|
||||||
template<typename dst_t>
|
template<typename dst_t>
|
||||||
static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||||
const int nb = k / QK_K;
|
const int nb = k / QK_K;
|
||||||
#if QK_K == 256
|
|
||||||
dequantize_block_q2_K<<<nb, 64, 0, stream>>>(vx, y);
|
dequantize_block_q2_K<<<nb, 64, 0, stream>>>(vx, y);
|
||||||
#else
|
|
||||||
dequantize_block_q2_K<<<nb, 32, 0, stream>>>(vx, y);
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename dst_t>
|
template<typename dst_t>
|
||||||
static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||||
const int nb = k / QK_K;
|
const int nb = k / QK_K;
|
||||||
#if QK_K == 256
|
|
||||||
dequantize_block_q3_K<<<nb, 64, 0, stream>>>(vx, y);
|
dequantize_block_q3_K<<<nb, 64, 0, stream>>>(vx, y);
|
||||||
#else
|
|
||||||
dequantize_block_q3_K<<<nb, 32, 0, stream>>>(vx, y);
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename dst_t>
|
template<typename dst_t>
|
||||||
|
@ -632,21 +506,13 @@ static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int64_t k
|
||||||
template<typename dst_t>
|
template<typename dst_t>
|
||||||
static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||||
const int nb = k / QK_K;
|
const int nb = k / QK_K;
|
||||||
#if QK_K == 256
|
|
||||||
dequantize_block_q5_K<<<nb, 64, 0, stream>>>(vx, y);
|
dequantize_block_q5_K<<<nb, 64, 0, stream>>>(vx, y);
|
||||||
#else
|
|
||||||
dequantize_block_q5_K<<<nb, 32, 0, stream>>>(vx, y);
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename dst_t>
|
template<typename dst_t>
|
||||||
static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||||
const int nb = k / QK_K;
|
const int nb = k / QK_K;
|
||||||
#if QK_K == 256
|
|
||||||
dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
|
dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
|
||||||
#else
|
|
||||||
dequantize_block_q6_K<<<nb, 32, 0, stream>>>(vx, y);
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename dst_t>
|
template<typename dst_t>
|
||||||
|
@ -700,11 +566,7 @@ static void dequantize_row_iq1_m_cuda(const void * vx, dst_t * y, const int64_t
|
||||||
template<typename dst_t>
|
template<typename dst_t>
|
||||||
static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||||
const int nb = (k + QK_K - 1) / QK_K;
|
const int nb = (k + QK_K - 1) / QK_K;
|
||||||
#if QK_K == 64
|
|
||||||
dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
|
|
||||||
#else
|
|
||||||
dequantize_block_iq4_xs<<<nb, 32, 0, stream>>>(vx, y);
|
dequantize_block_iq4_xs<<<nb, 32, 0, stream>>>(vx, y);
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename src_t, typename dst_t>
|
template <typename src_t, typename dst_t>
|
||||||
|
|
|
@ -22,7 +22,6 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
|
||||||
|
|
||||||
float tmp = 0; // partial sum for thread in warp
|
float tmp = 0; // partial sum for thread in warp
|
||||||
|
|
||||||
#if QK_K == 256
|
|
||||||
const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...15
|
const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...15
|
||||||
const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
|
const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
|
||||||
|
|
||||||
|
@ -71,37 +70,6 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
|
||||||
tmp += dall * sum1 - dmin * sum2;
|
tmp += dall * sum1 - dmin * sum2;
|
||||||
|
|
||||||
}
|
}
|
||||||
#else
|
|
||||||
const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15 or 0...7
|
|
||||||
const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); // 0....1 or 0...3
|
|
||||||
const int offset = tid * K_QUANTS_PER_ITERATION;
|
|
||||||
|
|
||||||
uint32_t uaux[2];
|
|
||||||
const uint8_t * d = (const uint8_t *)uaux;
|
|
||||||
|
|
||||||
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
|
||||||
|
|
||||||
const float * y = yy + i * QK_K + offset;
|
|
||||||
const uint8_t * q = x[i].qs + offset;
|
|
||||||
const uint32_t * s = (const uint32_t *)x[i].scales;
|
|
||||||
|
|
||||||
uaux[0] = s[0] & 0x0f0f0f0f;
|
|
||||||
uaux[1] = (s[0] >> 4) & 0x0f0f0f0f;
|
|
||||||
|
|
||||||
const float2 dall = __half22float2(x[i].dm);
|
|
||||||
|
|
||||||
float sum1 = 0, sum2 = 0;
|
|
||||||
for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
|
|
||||||
const uint8_t ql = q[l];
|
|
||||||
sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3)
|
|
||||||
+ y[l+16] * d[1] * ((ql >> 2) & 3)
|
|
||||||
+ y[l+32] * d[2] * ((ql >> 4) & 3)
|
|
||||||
+ y[l+48] * d[3] * ((ql >> 6) & 3);
|
|
||||||
sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7];
|
|
||||||
}
|
|
||||||
tmp += dall.x * sum1 - dall.y * sum2;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// sum up partial sums and write back result
|
// sum up partial sums and write back result
|
||||||
tmp = warp_reduce_sum(tmp);
|
tmp = warp_reduce_sum(tmp);
|
||||||
|
@ -123,8 +91,6 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx,
|
||||||
|
|
||||||
float tmp = 0; // partial sum for thread in warp
|
float tmp = 0; // partial sum for thread in warp
|
||||||
|
|
||||||
#if QK_K == 256
|
|
||||||
|
|
||||||
const uint16_t kmask1 = 0x0303;
|
const uint16_t kmask1 = 0x0303;
|
||||||
const uint16_t kmask2 = 0x0f0f;
|
const uint16_t kmask2 = 0x0f0f;
|
||||||
|
|
||||||
|
@ -175,34 +141,6 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx,
|
||||||
tmp += d * sum;
|
tmp += d * sum;
|
||||||
|
|
||||||
}
|
}
|
||||||
#else
|
|
||||||
|
|
||||||
const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15 or 0...7
|
|
||||||
const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); // 0....1 or 0...3
|
|
||||||
const int offset = tid * K_QUANTS_PER_ITERATION; // 0...15 or 0...14
|
|
||||||
const int in = offset/8; // 0 or 1
|
|
||||||
const int im = offset%8; // 0...7
|
|
||||||
|
|
||||||
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
|
||||||
|
|
||||||
const float * y = yy + i * QK_K + offset;
|
|
||||||
const uint8_t * q = x[i].qs + offset;
|
|
||||||
const uint8_t * s = x[i].scales;
|
|
||||||
|
|
||||||
const float dall = (float)x[i].d;
|
|
||||||
|
|
||||||
float sum = 0;
|
|
||||||
for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
|
|
||||||
const uint8_t hl = x[i].hmask[im+l] >> in;
|
|
||||||
const uint8_t ql = q[l];
|
|
||||||
sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4))
|
|
||||||
+ y[l+16] * dall * ((s[0] >> 4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4))
|
|
||||||
+ y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4))
|
|
||||||
+ y[l+48] * dall * ((s[1] >> 4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4));
|
|
||||||
}
|
|
||||||
tmp += sum;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// sum up partial sums and write back result
|
// sum up partial sums and write back result
|
||||||
tmp = warp_reduce_sum(tmp);
|
tmp = warp_reduce_sum(tmp);
|
||||||
|
@ -221,7 +159,6 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
|
||||||
|
|
||||||
const block_q4_K * x = (const block_q4_K *)vx + ib0;
|
const block_q4_K * x = (const block_q4_K *)vx + ib0;
|
||||||
|
|
||||||
#if QK_K == 256
|
|
||||||
const uint16_t kmask1 = 0x3f3f;
|
const uint16_t kmask1 = 0x3f3f;
|
||||||
const uint16_t kmask2 = 0x0f0f;
|
const uint16_t kmask2 = 0x0f0f;
|
||||||
const uint16_t kmask3 = 0xc0c0;
|
const uint16_t kmask3 = 0xc0c0;
|
||||||
|
@ -306,36 +243,6 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
}
|
}
|
||||||
#else
|
|
||||||
const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15
|
|
||||||
const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION);
|
|
||||||
|
|
||||||
const int step = tid * K_QUANTS_PER_ITERATION;
|
|
||||||
|
|
||||||
uint16_t aux16[2];
|
|
||||||
const uint8_t * s = (const uint8_t *)aux16;
|
|
||||||
|
|
||||||
float tmp = 0;
|
|
||||||
|
|
||||||
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
|
||||||
const uint8_t * q = x[i].qs + step;
|
|
||||||
const float * y = yy + i*QK_K + step;
|
|
||||||
const uint16_t * a = (const uint16_t *)x[i].scales;
|
|
||||||
aux16[0] = a[0] & 0x0f0f;
|
|
||||||
aux16[1] = (a[0] >> 4) & 0x0f0f;
|
|
||||||
const float d = (float)x[i].dm[0];
|
|
||||||
const float m = (float)x[i].dm[1];
|
|
||||||
float sum = 0.f;
|
|
||||||
for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
|
|
||||||
sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
|
|
||||||
+ y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2])
|
|
||||||
+ y[j+32] * (d * s[1] * (q[j+ 0] >> 4) - m * s[3])
|
|
||||||
+ y[j+48] * (d * s[1] * (q[j+16] >> 4) - m * s[3]);
|
|
||||||
}
|
|
||||||
tmp += sum;
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// sum up partial sums and write back result
|
// sum up partial sums and write back result
|
||||||
tmp = warp_reduce_sum(tmp);
|
tmp = warp_reduce_sum(tmp);
|
||||||
|
@ -355,7 +262,6 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
|
||||||
|
|
||||||
float tmp = 0; // partial sum for thread in warp
|
float tmp = 0; // partial sum for thread in warp
|
||||||
|
|
||||||
#if QK_K == 256
|
|
||||||
const uint16_t kmask1 = 0x3f3f;
|
const uint16_t kmask1 = 0x3f3f;
|
||||||
const uint16_t kmask2 = 0x0f0f;
|
const uint16_t kmask2 = 0x0f0f;
|
||||||
const uint16_t kmask3 = 0xc0c0;
|
const uint16_t kmask3 = 0xc0c0;
|
||||||
|
@ -426,30 +332,6 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
|
||||||
tmp += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin;
|
tmp += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin;
|
||||||
}
|
}
|
||||||
|
|
||||||
#else
|
|
||||||
const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15
|
|
||||||
const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION);
|
|
||||||
const int step = tid * K_QUANTS_PER_ITERATION;
|
|
||||||
const int im = step/8;
|
|
||||||
const int in = step%8;
|
|
||||||
|
|
||||||
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
|
||||||
const uint8_t * q = x[i].qs + step;
|
|
||||||
const int8_t * s = x[i].scales;
|
|
||||||
const float * y = yy + i*QK_K + step;
|
|
||||||
const float d = x[i].d;
|
|
||||||
float sum = 0.f;
|
|
||||||
for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
|
|
||||||
const uint8_t h = x[i].qh[in+j] >> im;
|
|
||||||
sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16))
|
|
||||||
+ y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16))
|
|
||||||
+ y[j+32] * d * s[2] * ((q[j+ 0] >> 4) - ((h >> 4) & 1 ? 0 : 16))
|
|
||||||
+ y[j+48] * d * s[3] * ((q[j+16] >> 4) - ((h >> 6) & 1 ? 0 : 16));
|
|
||||||
}
|
|
||||||
tmp += sum;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// sum up partial sums and write back result
|
// sum up partial sums and write back result
|
||||||
tmp = warp_reduce_sum(tmp);
|
tmp = warp_reduce_sum(tmp);
|
||||||
|
|
||||||
|
@ -470,8 +352,6 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx,
|
||||||
|
|
||||||
const block_q6_K * x = (const block_q6_K *)vx + ib0;
|
const block_q6_K * x = (const block_q6_K *)vx + ib0;
|
||||||
|
|
||||||
#if QK_K == 256
|
|
||||||
|
|
||||||
const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
|
const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
|
||||||
const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0, 1
|
const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0, 1
|
||||||
|
|
||||||
|
@ -526,37 +406,6 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx,
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#else
|
|
||||||
|
|
||||||
const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...7
|
|
||||||
const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); // 0...3
|
|
||||||
|
|
||||||
const int step = tid * K_QUANTS_PER_ITERATION;
|
|
||||||
|
|
||||||
float tmp = 0; // partial sum for thread in warp
|
|
||||||
|
|
||||||
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
|
||||||
|
|
||||||
const float * y = yy + i * QK_K + step;
|
|
||||||
const uint8_t * ql = x[i].ql + step;
|
|
||||||
const uint8_t * qh = x[i].qh + step;
|
|
||||||
const int8_t * s = x[i].scales;
|
|
||||||
|
|
||||||
const float d = x[i+0].d;
|
|
||||||
|
|
||||||
float sum = 0;
|
|
||||||
for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
|
|
||||||
sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32)
|
|
||||||
+ y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32)
|
|
||||||
+ y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >> 4) | ((qh[j] & 0x30) >> 0)) - 32)
|
|
||||||
+ y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >> 4) | ((qh[j] & 0xc0) >> 2)) - 32);
|
|
||||||
}
|
|
||||||
tmp += sum;
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// sum up partial sums and write back result
|
// sum up partial sums and write back result
|
||||||
tmp = warp_reduce_sum(tmp);
|
tmp = warp_reduce_sum(tmp);
|
||||||
|
|
||||||
|
|
|
@ -83,7 +83,7 @@ static __global__ void flash_attn_tile_ext_f16(
|
||||||
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
|
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
|
||||||
const int i = i0 + threadIdx.x;
|
const int i = i0 + threadIdx.x;
|
||||||
|
|
||||||
const float2 tmp = Q_f2[j*(nb01/sizeof(float2)) + i];
|
const float2 tmp = ic0 + j < ne01 ? Q_f2[j*(nb01/sizeof(float2)) + i] : make_float2(0.0f, 0.0f);
|
||||||
Q_h2[j][i] = make_half2(scale, scale) * make_half2(tmp.x, tmp.y);
|
Q_h2[j][i] = make_half2(scale, scale) * make_half2(tmp.x, tmp.y);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -238,6 +238,10 @@ static __global__ void flash_attn_tile_ext_f16(
|
||||||
for (int j_VKQ_0 = 0; j_VKQ_0 < ncols; j_VKQ_0 += nwarps) {
|
for (int j_VKQ_0 = 0; j_VKQ_0 < ncols; j_VKQ_0 += nwarps) {
|
||||||
const int j_VKQ = j_VKQ_0 + threadIdx.y;
|
const int j_VKQ = j_VKQ_0 + threadIdx.y;
|
||||||
|
|
||||||
|
if (ic0 + j_VKQ >= ne01) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
half kqsum_j = __low2half(kqsum[j_VKQ_0/nwarps]) + __high2half(kqsum[j_VKQ_0/nwarps]);
|
half kqsum_j = __low2half(kqsum[j_VKQ_0/nwarps]) + __high2half(kqsum[j_VKQ_0/nwarps]);
|
||||||
kqsum_j = warp_reduce_sum(kqsum_j);
|
kqsum_j = warp_reduce_sum(kqsum_j);
|
||||||
|
|
||||||
|
|
|
@ -79,7 +79,7 @@ static __global__ void flash_attn_tile_ext_f32(
|
||||||
|
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int i0 = 0; i0 < D; i0 += 2*WARP_SIZE) {
|
for (int i0 = 0; i0 < D; i0 += 2*WARP_SIZE) {
|
||||||
float2 tmp = Q_f2[j*(nb01/sizeof(float2)) + i0/2 + threadIdx.x];
|
float2 tmp = ic0 + j < ne01 ? Q_f2[j*(nb01/sizeof(float2)) + i0/2 + threadIdx.x] : make_float2(0.0f, 0.0f);
|
||||||
Q_f[j][i0 + 0*WARP_SIZE + threadIdx.x] = tmp.x * scale;
|
Q_f[j][i0 + 0*WARP_SIZE + threadIdx.x] = tmp.x * scale;
|
||||||
Q_f[j][i0 + 1*WARP_SIZE + threadIdx.x] = tmp.y * scale;
|
Q_f[j][i0 + 1*WARP_SIZE + threadIdx.x] = tmp.y * scale;
|
||||||
}
|
}
|
||||||
|
@ -237,6 +237,10 @@ static __global__ void flash_attn_tile_ext_f32(
|
||||||
for (int j_VKQ_0 = 0; j_VKQ_0 < ncols; j_VKQ_0 += nwarps) {
|
for (int j_VKQ_0 = 0; j_VKQ_0 < ncols; j_VKQ_0 += nwarps) {
|
||||||
const int j_VKQ = j_VKQ_0 + threadIdx.y;
|
const int j_VKQ = j_VKQ_0 + threadIdx.y;
|
||||||
|
|
||||||
|
if (ic0 + j_VKQ >= ne01) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
float kqsum_j = kqsum[j_VKQ_0/nwarps];
|
float kqsum_j = kqsum[j_VKQ_0/nwarps];
|
||||||
kqsum_j = warp_reduce_sum(kqsum_j);
|
kqsum_j = warp_reduce_sum(kqsum_j);
|
||||||
|
|
||||||
|
@ -283,12 +287,8 @@ void launch_fattn_tile_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor *
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_cuda_flash_attn_ext_tile_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
void ggml_cuda_flash_attn_ext_tile_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||||
const ggml_tensor * KQV = dst;
|
|
||||||
const ggml_tensor * Q = dst->src[0];
|
const ggml_tensor * Q = dst->src[0];
|
||||||
|
|
||||||
const int32_t precision = KQV->op_params[2];
|
|
||||||
GGML_ASSERT(precision == GGML_PREC_DEFAULT);
|
|
||||||
|
|
||||||
if (Q->ne[1] <= 16) {
|
if (Q->ne[1] <= 16) {
|
||||||
constexpr int cols_per_block = 16;
|
constexpr int cols_per_block = 16;
|
||||||
constexpr int parallel_blocks = 4;
|
constexpr int parallel_blocks = 4;
|
||||||
|
|
|
@ -94,7 +94,7 @@ static __global__ void flash_attn_vec_ext_f16(
|
||||||
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
|
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
|
||||||
const int i = i0 + threadIdx.x;
|
const int i = i0 + threadIdx.x;
|
||||||
|
|
||||||
const float2 tmp = Q_f2[j*(nb01/sizeof(float2)) + i];
|
const float2 tmp = ncols <= 2 || ic0 + j < ne01 ? Q_f2[j*(nb01/sizeof(float2)) + i] : make_float2(0.0f, 0.0f);
|
||||||
Q_h2[j][i0/WARP_SIZE] = make_half2(scale, scale) * make_half2(tmp.x, tmp.y);
|
Q_h2[j][i0/WARP_SIZE] = make_half2(scale, scale) * make_half2(tmp.x, tmp.y);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -212,6 +212,10 @@ static __global__ void flash_attn_vec_ext_f16(
|
||||||
|
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) {
|
for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) {
|
||||||
|
if (ncols > 2 && ic0 + j_VKQ >= ne01) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
kqsum[j_VKQ] = kqsum_shared[j_VKQ][threadIdx.x];
|
kqsum[j_VKQ] = kqsum_shared[j_VKQ][threadIdx.x];
|
||||||
kqsum[j_VKQ] = warp_reduce_sum(kqsum[j_VKQ]);
|
kqsum[j_VKQ] = warp_reduce_sum(kqsum[j_VKQ]);
|
||||||
|
|
||||||
|
@ -223,7 +227,7 @@ static __global__ void flash_attn_vec_ext_f16(
|
||||||
dst[j_dst*D*gridDim.y + D*blockIdx.y + tid] = dst_val;
|
dst[j_dst*D*gridDim.y + D*blockIdx.y + tid] = dst_val;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (parallel_blocks != 1 && tid < ncols) {
|
if (parallel_blocks != 1 && tid < ncols && (ncols <= 2 || ic0 + tid < ne01)) {
|
||||||
dst_meta[(ic0 + tid)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[tid], kqsum[tid]);
|
dst_meta[(ic0 + tid)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[tid], kqsum[tid]);
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
|
|
|
@ -91,7 +91,7 @@ static __global__ void flash_attn_vec_ext_f32(
|
||||||
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
|
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
|
||||||
const int i = i0 + threadIdx.x;
|
const int i = i0 + threadIdx.x;
|
||||||
|
|
||||||
Q_h2[j][i0/WARP_SIZE] = Q_f2[j*(nb01/sizeof(float2)) + i];
|
// Zero-fill Q for columns at or beyond ne01 instead of reading past the end of Q (same guard as flash_attn_vec_ext_f16).
Q_h2[j][i0/WARP_SIZE] = ncols <= 2 || ic0 + j < ne01 ? Q_f2[j*(nb01/sizeof(float2)) + i] : make_float2(0.0f, 0.0f);
|
||||||
Q_h2[j][i0/WARP_SIZE].x *= scale;
|
Q_h2[j][i0/WARP_SIZE].x *= scale;
|
||||||
Q_h2[j][i0/WARP_SIZE].y *= scale;
|
Q_h2[j][i0/WARP_SIZE].y *= scale;
|
||||||
}
|
}
|
||||||
|
@ -200,6 +200,10 @@ static __global__ void flash_attn_vec_ext_f32(
|
||||||
|
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) {
|
for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) {
|
||||||
|
if (ncols > 2 && ic0 + j_VKQ >= ne01) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
kqsum[j_VKQ] = kqsum_shared[j_VKQ][threadIdx.x];
|
kqsum[j_VKQ] = kqsum_shared[j_VKQ][threadIdx.x];
|
||||||
kqsum[j_VKQ] = warp_reduce_sum(kqsum[j_VKQ]);
|
kqsum[j_VKQ] = warp_reduce_sum(kqsum[j_VKQ]);
|
||||||
|
|
||||||
|
@ -211,7 +215,7 @@ static __global__ void flash_attn_vec_ext_f32(
|
||||||
dst[j_dst*D*gridDim.y + D*blockIdx.y + tid] = dst_val;
|
dst[j_dst*D*gridDim.y + D*blockIdx.y + tid] = dst_val;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (parallel_blocks != 1 && tid < ncols) {
|
if (parallel_blocks != 1 && tid < ncols && (ncols <= 2 || ic0 + tid < ne01)) {
|
||||||
dst_meta[(ic0 + tid)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[tid], kqsum[tid]);
|
dst_meta[(ic0 + tid)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[tid], kqsum[tid]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
1253
ggml-cuda/mmq.cu
1253
ggml-cuda/mmq.cu
File diff suppressed because it is too large
Load diff
|
@ -58,10 +58,10 @@ static __global__ void rope(
|
||||||
dst[i + 1] = x0*sin_theta + x1*cos_theta;
|
dst[i + 1] = x0*sin_theta + x1*cos_theta;
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename T, bool has_pos>
|
template<typename T, bool has_pos, bool has_freq_facs>
|
||||||
static __global__ void rope_neox(
|
static __global__ void rope_neox(
|
||||||
const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
|
const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
|
||||||
float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims
|
float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims, const float * freq_factors
|
||||||
) {
|
) {
|
||||||
const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
|
const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
|
||||||
|
|
||||||
|
@ -88,7 +88,9 @@ static __global__ void rope_neox(
|
||||||
float cur_rot = inv_ndims * ic - ib;
|
float cur_rot = inv_ndims * ic - ib;
|
||||||
|
|
||||||
const int p = has_pos ? pos[i2] : 0;
|
const int p = has_pos ? pos[i2] : 0;
|
||||||
const float theta_base = p*freq_scale*powf(theta_scale, col/2.0f);
|
const float freq_factor = has_freq_facs ? freq_factors[ic/2] : 1.0f;
|
||||||
|
|
||||||
|
const float theta_base = p*freq_scale*powf(theta_scale, col/2.0f)/freq_factor;
|
||||||
|
|
||||||
float cos_theta, sin_theta;
|
float cos_theta, sin_theta;
|
||||||
rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
|
rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
|
||||||
|
@ -164,7 +166,7 @@ static void rope_cuda(
|
||||||
template<typename T>
|
template<typename T>
|
||||||
static void rope_neox_cuda(
|
static void rope_neox_cuda(
|
||||||
const T * x, T * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
|
const T * x, T * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
|
||||||
float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
|
float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream
|
||||||
) {
|
) {
|
||||||
GGML_ASSERT(ncols % 2 == 0);
|
GGML_ASSERT(ncols % 2 == 0);
|
||||||
const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
|
const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
|
||||||
|
@ -175,16 +177,30 @@ static void rope_neox_cuda(
|
||||||
const float inv_ndims = -1.0f / n_dims;
|
const float inv_ndims = -1.0f / n_dims;
|
||||||
|
|
||||||
if (pos == nullptr) {
|
if (pos == nullptr) {
|
||||||
rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(
|
if (freq_factors == nullptr) {
|
||||||
|
rope_neox<T, false, false><<<block_nums, block_dims, 0, stream>>>(
|
||||||
x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
|
x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
|
||||||
theta_scale, inv_ndims
|
theta_scale, inv_ndims, freq_factors
|
||||||
);
|
);
|
||||||
} else {
|
} else {
|
||||||
rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(
|
rope_neox<T, false, true><<<block_nums, block_dims, 0, stream>>>(
|
||||||
x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
|
x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
|
||||||
theta_scale, inv_ndims
|
theta_scale, inv_ndims, freq_factors
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
if (freq_factors == nullptr) {
|
||||||
|
rope_neox<T, true, false><<<block_nums, block_dims, 0, stream>>>(
|
||||||
|
x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
|
||||||
|
theta_scale, inv_ndims, freq_factors
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
rope_neox<T, true, true><<<block_nums, block_dims, 0, stream>>>(
|
||||||
|
x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
|
||||||
|
theta_scale, inv_ndims, freq_factors
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void rope_glm_f32_cuda(
|
static void rope_glm_f32_cuda(
|
||||||
|
@ -214,24 +230,27 @@ static void rope_cuda_f32(
|
||||||
|
|
||||||
static void rope_neox_cuda_f16(
|
static void rope_neox_cuda_f16(
|
||||||
const half * x, half * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
|
const half * x, half * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
|
||||||
float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream) {
|
float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) {
|
||||||
|
|
||||||
rope_neox_cuda<half>(x, dst, ncols, n_dims, nrows, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, stream);
|
rope_neox_cuda<half>(x, dst, ncols, n_dims, nrows, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void rope_neox_cuda_f32(
|
static void rope_neox_cuda_f32(
|
||||||
const float * x, float * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
|
const float * x, float * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
|
||||||
float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
|
float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream
|
||||||
) {
|
) {
|
||||||
|
|
||||||
rope_neox_cuda<float>(x, dst, ncols, n_dims, nrows, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, stream);
|
rope_neox_cuda<float>(x, dst, ncols, n_dims, nrows, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream);
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||||
const ggml_tensor * src0 = dst->src[0];
|
const ggml_tensor * src0 = dst->src[0];
|
||||||
const ggml_tensor * src1 = dst->src[1];
|
const ggml_tensor * src1 = dst->src[1];
|
||||||
|
const ggml_tensor * src2 = dst->src[2];
|
||||||
|
|
||||||
const float * src0_d = (const float *)src0->data;
|
const float * src0_d = (const float *)src0->data;
|
||||||
const float * src1_d = (const float *)src1->data;
|
const float * src1_d = (const float *)src1->data;
|
||||||
|
|
||||||
float * dst_d = (float *)dst->data;
|
float * dst_d = (float *)dst->data;
|
||||||
cudaStream_t stream = ctx.stream();
|
cudaStream_t stream = ctx.stream();
|
||||||
|
|
||||||
|
@ -241,7 +260,6 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||||
|
|
||||||
const int64_t ne00 = src0->ne[0];
|
const int64_t ne00 = src0->ne[0];
|
||||||
const int64_t ne01 = src0->ne[1];
|
const int64_t ne01 = src0->ne[1];
|
||||||
const int64_t ne2 = dst->ne[2];
|
|
||||||
const int64_t nrows = ggml_nrows(src0);
|
const int64_t nrows = ggml_nrows(src0);
|
||||||
|
|
||||||
//const int n_past = ((int32_t *) dst->op_params)[0];
|
//const int n_past = ((int32_t *) dst->op_params)[0];
|
||||||
|
@ -259,16 +277,22 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||||
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
|
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
|
||||||
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
|
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
|
||||||
|
|
||||||
|
const float * freq_factors = nullptr;
|
||||||
const int32_t * pos = nullptr;
|
const int32_t * pos = nullptr;
|
||||||
if ((mode & 1) == 0) {
|
|
||||||
GGML_ASSERT(src1->type == GGML_TYPE_I32);
|
|
||||||
GGML_ASSERT(src1->ne[0] == ne2);
|
|
||||||
pos = (const int32_t *) src1_d;
|
|
||||||
}
|
|
||||||
|
|
||||||
const bool is_neox = mode & 2;
|
const bool is_neox = mode & 2;
|
||||||
const bool is_glm = mode & 4;
|
const bool is_glm = mode & 4;
|
||||||
|
|
||||||
|
pos = (const int32_t *) src1_d;
|
||||||
|
|
||||||
|
if (is_neox) {
|
||||||
|
if (src2 != nullptr) {
|
||||||
|
freq_factors = (const float *) src2->data;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
GGML_ASSERT(src2 == nullptr && "TODO: freq_factors not implemented for !is_neox");
|
||||||
|
}
|
||||||
|
|
||||||
rope_corr_dims corr_dims;
|
rope_corr_dims corr_dims;
|
||||||
ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v);
|
ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v);
|
||||||
|
|
||||||
|
@ -280,12 +304,12 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||||
if (src0->type == GGML_TYPE_F32) {
|
if (src0->type == GGML_TYPE_F32) {
|
||||||
rope_neox_cuda_f32(
|
rope_neox_cuda_f32(
|
||||||
(const float *)src0_d, (float *)dst_d, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
|
(const float *)src0_d, (float *)dst_d, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
|
||||||
attn_factor, corr_dims, stream
|
attn_factor, corr_dims, freq_factors, stream
|
||||||
);
|
);
|
||||||
} else if (src0->type == GGML_TYPE_F16) {
|
} else if (src0->type == GGML_TYPE_F16) {
|
||||||
rope_neox_cuda_f16(
|
rope_neox_cuda_f16(
|
||||||
(const half *)src0_d, (half *)dst_d, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
|
(const half *)src0_d, (half *)dst_d, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
|
||||||
attn_factor, corr_dims, stream
|
attn_factor, corr_dims, freq_factors, stream
|
||||||
);
|
);
|
||||||
} else {
|
} else {
|
||||||
GGML_ASSERT(false);
|
GGML_ASSERT(false);
|
||||||
|
|
|
@ -712,7 +712,6 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
|
||||||
static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
|
static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
|
||||||
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
||||||
|
|
||||||
#ifndef GGML_QKK_64
|
|
||||||
const block_q4_K * bq4_K = (const block_q4_K *) vbq;
|
const block_q4_K * bq4_K = (const block_q4_K *) vbq;
|
||||||
|
|
||||||
int v[2];
|
int v[2];
|
||||||
|
@ -754,58 +753,11 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
|
||||||
}
|
}
|
||||||
|
|
||||||
return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
|
return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
|
||||||
|
|
||||||
#else
|
|
||||||
|
|
||||||
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
|
||||||
const block_q4_K * bq4_K = (const block_q4_K *) vbq;
|
|
||||||
|
|
||||||
float sumf_d = 0.0f;
|
|
||||||
float sumf_m = 0.0f;
|
|
||||||
|
|
||||||
uint16_t aux16[2];
|
|
||||||
const uint8_t * s = (const uint8_t *)aux16;
|
|
||||||
|
|
||||||
const uint16_t * a = (const uint16_t *)bq4_K->scales;
|
|
||||||
aux16[0] = a[0] & 0x0f0f;
|
|
||||||
aux16[1] = (a[0] >> 4) & 0x0f0f;
|
|
||||||
|
|
||||||
const float dall = bq4_K->dm[0];
|
|
||||||
const float dmin = bq4_K->dm[1];
|
|
||||||
|
|
||||||
const float d8_1 = __low2float(bq8_1[0].ds);
|
|
||||||
const float d8_2 = __low2float(bq8_1[1].ds);
|
|
||||||
|
|
||||||
const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
|
|
||||||
const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
|
|
||||||
const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
|
|
||||||
const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
|
|
||||||
|
|
||||||
const int * q4 = (const int *)bq4_K->qs + (iqs/2);
|
|
||||||
const int v1 = q4[0];
|
|
||||||
const int v2 = q4[4];
|
|
||||||
|
|
||||||
const int dot1 = __dp4a(ui2, v2 & 0x0f0f0f0f, __dp4a(ui1, v1 & 0x0f0f0f0f, 0));
|
|
||||||
const int dot2 = __dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, __dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
|
|
||||||
const int dot3 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
|
|
||||||
const int dot4 = __dp4a(0x01010101, ui4, __dp4a(0x01010101, ui3, 0));
|
|
||||||
|
|
||||||
sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
|
|
||||||
sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
|
|
||||||
|
|
||||||
return dall * sumf_d - dmin * sumf_m;
|
|
||||||
|
|
||||||
#else
|
|
||||||
NO_DEVICE_CODE;
|
|
||||||
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
|
||||||
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
|
static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
|
||||||
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
||||||
|
|
||||||
#ifndef GGML_QKK_64
|
|
||||||
const block_q5_K * bq5_K = (const block_q5_K *) vbq;
|
const block_q5_K * bq5_K = (const block_q5_K *) vbq;
|
||||||
|
|
||||||
int vl[2];
|
int vl[2];
|
||||||
|
@ -847,48 +799,6 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
|
||||||
}
|
}
|
||||||
|
|
||||||
return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);
|
return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);
|
||||||
|
|
||||||
#else
|
|
||||||
|
|
||||||
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
|
||||||
const block_q5_K * bq5_K = (const block_q5_K *) vbq;
|
|
||||||
|
|
||||||
const int8_t * s = bq5_K->scales;
|
|
||||||
|
|
||||||
const float d = bq5_K->d;
|
|
||||||
|
|
||||||
const float d8_1 = __low2half(bq8_1[0].ds);
|
|
||||||
const float d8_2 = __low2half(bq8_1[1].ds);
|
|
||||||
|
|
||||||
const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
|
|
||||||
const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
|
|
||||||
const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
|
|
||||||
const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
|
|
||||||
|
|
||||||
const int * ql = (const int *)bq5_K->qs + (iqs/2);
|
|
||||||
const int vl1 = ql[0];
|
|
||||||
const int vl2 = ql[4];
|
|
||||||
|
|
||||||
const int step = 4 * (iqs/2); // 0, 4, 8, 12
|
|
||||||
const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6
|
|
||||||
const int in = step%8; // 0, 4, 0, 4
|
|
||||||
const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
|
|
||||||
|
|
||||||
const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
|
|
||||||
const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
|
|
||||||
const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
|
|
||||||
const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
|
|
||||||
|
|
||||||
const float sumf_d = d8_1 * (__dp4a(ui1, v1, 0) * s[0] + __dp4a(ui2, v2, 0) * s[1])
|
|
||||||
+ d8_2 * (__dp4a(ui3, v3, 0) * s[2] + __dp4a(ui4, v4, 0) * s[3]);
|
|
||||||
|
|
||||||
return d * sumf_d;
|
|
||||||
|
|
||||||
#else
|
|
||||||
NO_DEVICE_CODE;
|
|
||||||
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
|
||||||
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
|
static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
|
||||||
|
@ -919,7 +829,6 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
|
||||||
|
|
||||||
static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
|
static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
|
||||||
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
||||||
#if QK_K == 256
|
|
||||||
const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq;
|
const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq;
|
||||||
|
|
||||||
#if QR2_XXS == 8
|
#if QR2_XXS == 8
|
||||||
|
@ -960,15 +869,11 @@ static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
|
||||||
}
|
}
|
||||||
return d * (sumi1 + sumi2);
|
return d * (sumi1 + sumi2);
|
||||||
#endif
|
#endif
|
||||||
#else
|
|
||||||
NO_DEVICE_CODE;
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
|
static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
|
||||||
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
||||||
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
||||||
#if QK_K == 256
|
|
||||||
const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq;
|
const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq;
|
||||||
|
|
||||||
const int ib32 = iqs;
|
const int ib32 = iqs;
|
||||||
|
@ -1002,17 +907,12 @@ static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
|
||||||
GGML_UNUSED(ksigns64);
|
GGML_UNUSED(ksigns64);
|
||||||
NO_DEVICE_CODE;
|
NO_DEVICE_CODE;
|
||||||
#endif
|
#endif
|
||||||
#else
|
|
||||||
GGML_UNUSED(ksigns64);
|
|
||||||
NO_DEVICE_CODE;
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO
|
// TODO
|
||||||
static __device__ __forceinline__ float vec_dot_iq2_s_q8_1(
|
static __device__ __forceinline__ float vec_dot_iq2_s_q8_1(
|
||||||
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
||||||
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
||||||
#if QK_K == 256
|
|
||||||
const block_iq2_s * bq2 = (const block_iq2_s *) vbq;
|
const block_iq2_s * bq2 = (const block_iq2_s *) vbq;
|
||||||
|
|
||||||
const int ib32 = iqs;
|
const int ib32 = iqs;
|
||||||
|
@ -1048,16 +948,11 @@ static __device__ __forceinline__ float vec_dot_iq2_s_q8_1(
|
||||||
GGML_UNUSED(ksigns64);
|
GGML_UNUSED(ksigns64);
|
||||||
NO_DEVICE_CODE;
|
NO_DEVICE_CODE;
|
||||||
#endif
|
#endif
|
||||||
#else
|
|
||||||
GGML_UNUSED(ksigns64);
|
|
||||||
NO_DEVICE_CODE;
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
|
static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
|
||||||
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
||||||
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
||||||
#if QK_K == 256
|
|
||||||
const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq;
|
const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq;
|
||||||
|
|
||||||
const int ib32 = iqs;
|
const int ib32 = iqs;
|
||||||
|
@ -1082,16 +977,12 @@ static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
|
||||||
#else
|
#else
|
||||||
NO_DEVICE_CODE;
|
NO_DEVICE_CODE;
|
||||||
#endif
|
#endif
|
||||||
#else
|
|
||||||
NO_DEVICE_CODE;
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: don't use lookup table for signs
|
// TODO: don't use lookup table for signs
|
||||||
static __device__ __forceinline__ float vec_dot_iq3_s_q8_1(
|
static __device__ __forceinline__ float vec_dot_iq3_s_q8_1(
|
||||||
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
||||||
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
||||||
#if QK_K == 256
|
|
||||||
const block_iq3_s * bq2 = (const block_iq3_s *) vbq;
|
const block_iq3_s * bq2 = (const block_iq3_s *) vbq;
|
||||||
|
|
||||||
const int ib32 = iqs;
|
const int ib32 = iqs;
|
||||||
|
@ -1114,14 +1005,10 @@ static __device__ __forceinline__ float vec_dot_iq3_s_q8_1(
|
||||||
#else
|
#else
|
||||||
NO_DEVICE_CODE;
|
NO_DEVICE_CODE;
|
||||||
#endif
|
#endif
|
||||||
#else
|
|
||||||
NO_DEVICE_CODE;
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(
|
static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(
|
||||||
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
||||||
#if QK_K == 256
|
|
||||||
const block_iq1_s * bq1 = (const block_iq1_s *) vbq;
|
const block_iq1_s * bq1 = (const block_iq1_s *) vbq;
|
||||||
|
|
||||||
const int ib32 = iqs;
|
const int ib32 = iqs;
|
||||||
|
@ -1149,14 +1036,10 @@ static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(
|
||||||
const float d = d1q * __low2float (bq8_1[ib32].ds);
|
const float d = d1q * __low2float (bq8_1[ib32].ds);
|
||||||
const float m = d1q * __high2float(bq8_1[ib32].ds);
|
const float m = d1q * __high2float(bq8_1[ib32].ds);
|
||||||
return d * sumi + m * delta;
|
return d * sumi + m * delta;
|
||||||
#else
|
|
||||||
NO_DEVICE_CODE;
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static __device__ __forceinline__ float vec_dot_iq1_m_q8_1(
|
static __device__ __forceinline__ float vec_dot_iq1_m_q8_1(
|
||||||
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
||||||
#if QK_K == 256
|
|
||||||
const block_iq1_m * bq1 = (const block_iq1_m *) vbq;
|
const block_iq1_m * bq1 = (const block_iq1_m *) vbq;
|
||||||
|
|
||||||
const int ib32 = iqs;
|
const int ib32 = iqs;
|
||||||
|
@ -1192,9 +1075,6 @@ static __device__ __forceinline__ float vec_dot_iq1_m_q8_1(
|
||||||
scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
|
scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
|
||||||
const float d = (float)scale.f16 * __low2float (bq8_1[ib32].ds);
|
const float d = (float)scale.f16 * __low2float (bq8_1[ib32].ds);
|
||||||
return d * ((sumi[0] + sumf[0]) * (2*((sc[ib32/2] >> 6*(ib32%2)) & 0x7) + 1) + (sumi[1] + sumf[1]) * (2*((sc[ib32/2] >> (6*(ib32%2)+3)) & 0x7) + 1));
|
return d * ((sumi[0] + sumf[0]) * (2*((sc[ib32/2] >> 6*(ib32%2)) & 0x7) + 1) + (sumi[1] + sumf[1]) * (2*((sc[ib32/2] >> (6*(ib32%2)+3)) & 0x7) + 1));
|
||||||
#else
|
|
||||||
NO_DEVICE_CODE;
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
||||||
|
@ -1250,9 +1130,7 @@ static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1(
|
||||||
static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1(
|
static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1(
|
||||||
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
||||||
|
|
||||||
#if QK_K == 256
|
|
||||||
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
||||||
|
|
||||||
const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq;
|
const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq;
|
||||||
const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
|
const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
|
||||||
|
|
||||||
|
@ -1270,10 +1148,6 @@ static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1(
|
||||||
sumi2 = __dp4a(v2, q8[j+4], sumi2);
|
sumi2 = __dp4a(v2, q8[j+4], sumi2);
|
||||||
}
|
}
|
||||||
return d * (sumi1 + sumi2);
|
return d * (sumi1 + sumi2);
|
||||||
|
|
||||||
#else
|
|
||||||
NO_DEVICE_CODE;
|
|
||||||
#endif
|
|
||||||
#else
|
#else
|
||||||
return vec_dot_iq4_xs_q8_1(vbq, bq8_1, iqs);
|
return vec_dot_iq4_xs_q8_1(vbq, bq8_1, iqs);
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -1677,6 +1677,10 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
|
||||||
} break;
|
} break;
|
||||||
case GGML_OP_ROPE:
|
case GGML_OP_ROPE:
|
||||||
{
|
{
|
||||||
|
#pragma message("TODO: implement phi3 frequency factors support")
|
||||||
|
#pragma message(" https://github.com/ggerganov/llama.cpp/pull/7225")
|
||||||
|
GGML_ASSERT(dst->src[2] == nullptr && "phi3 frequency factors not implemented yet");
|
||||||
|
|
||||||
GGML_ASSERT(ne10 == ne02);
|
GGML_ASSERT(ne10 == ne02);
|
||||||
GGML_ASSERT(src0t == dstt);
|
GGML_ASSERT(src0t == dstt);
|
||||||
// const int n_past = ((int32_t *) dst->op_params)[0];
|
// const int n_past = ((int32_t *) dst->op_params)[0];
|
||||||
|
|
122
ggml-metal.m
122
ggml-metal.m
|
@ -381,10 +381,6 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||||
// dictionary of preprocessor macros
|
// dictionary of preprocessor macros
|
||||||
NSMutableDictionary * prep = [NSMutableDictionary dictionary];
|
NSMutableDictionary * prep = [NSMutableDictionary dictionary];
|
||||||
|
|
||||||
#ifdef GGML_QKK_64
|
|
||||||
prep[@"GGML_QKK_64"] = @(1);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
MTLCompileOptions* options = [MTLCompileOptions new];
|
MTLCompileOptions* options = [MTLCompileOptions new];
|
||||||
options.preprocessorMacros = prep;
|
options.preprocessorMacros = prep;
|
||||||
|
|
||||||
|
@ -927,12 +923,22 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||||
const int64_t ne10 = src1 ? src1->ne[0] : 0;
|
const int64_t ne10 = src1 ? src1->ne[0] : 0;
|
||||||
const int64_t ne11 = src1 ? src1->ne[1] : 0;
|
const int64_t ne11 = src1 ? src1->ne[1] : 0;
|
||||||
const int64_t ne12 = src1 ? src1->ne[2] : 0;
|
const int64_t ne12 = src1 ? src1->ne[2] : 0;
|
||||||
const int64_t ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13);
|
const int64_t ne13 = src1 ? src1->ne[3] : 0;
|
||||||
|
|
||||||
const uint64_t nb10 = src1 ? src1->nb[0] : 0;
|
const uint64_t nb10 = src1 ? src1->nb[0] : 0;
|
||||||
const uint64_t nb11 = src1 ? src1->nb[1] : 0;
|
const uint64_t nb11 = src1 ? src1->nb[1] : 0;
|
||||||
const uint64_t nb12 = src1 ? src1->nb[2] : 0;
|
const uint64_t nb12 = src1 ? src1->nb[2] : 0;
|
||||||
const uint64_t nb13 = src1 ? src1->nb[3] : 0; UNUSED(nb13);
|
const uint64_t nb13 = src1 ? src1->nb[3] : 0;
|
||||||
|
|
||||||
|
const int64_t ne20 = src2 ? src2->ne[0] : 0;
|
||||||
|
const int64_t ne21 = src2 ? src2->ne[1] : 0;
|
||||||
|
const int64_t ne22 = src2 ? src2->ne[2] : 0; GGML_UNUSED(ne22);
|
||||||
|
const int64_t ne23 = src2 ? src2->ne[3] : 0; GGML_UNUSED(ne23);
|
||||||
|
|
||||||
|
const uint64_t nb20 = src2 ? src2->nb[0] : 0; GGML_UNUSED(nb20);
|
||||||
|
const uint64_t nb21 = src2 ? src2->nb[1] : 0;
|
||||||
|
const uint64_t nb22 = src2 ? src2->nb[2] : 0;
|
||||||
|
const uint64_t nb23 = src2 ? src2->nb[3] : 0;
|
||||||
|
|
||||||
const int64_t ne0 = dst ? dst->ne[0] : 0;
|
const int64_t ne0 = dst ? dst->ne[0] : 0;
|
||||||
const int64_t ne1 = dst ? dst->ne[1] : 0;
|
const int64_t ne1 = dst ? dst->ne[1] : 0;
|
||||||
|
@ -1763,11 +1769,7 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||||
}
|
}
|
||||||
else if (src0t == GGML_TYPE_Q3_K) {
|
else if (src0t == GGML_TYPE_Q3_K) {
|
||||||
#ifdef GGML_QKK_64
|
|
||||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
|
||||||
#else
|
|
||||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
else if (src0t == GGML_TYPE_Q5_K) {
|
else if (src0t == GGML_TYPE_Q5_K) {
|
||||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||||
|
@ -1785,16 +1787,6 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||||
const int n_as = src0->ne[2];
|
const int n_as = src0->ne[2];
|
||||||
|
|
||||||
// src2 = ids
|
// src2 = ids
|
||||||
const int64_t ne20 = src2->ne[0];
|
|
||||||
const int64_t ne21 = src2->ne[1];
|
|
||||||
const int64_t ne22 = src2->ne[2]; GGML_UNUSED(ne22);
|
|
||||||
const int64_t ne23 = src2->ne[3]; GGML_UNUSED(ne23);
|
|
||||||
|
|
||||||
const uint64_t nb20 = src2->nb[0]; GGML_UNUSED(nb20);
|
|
||||||
const uint64_t nb21 = src2->nb[1];
|
|
||||||
const uint64_t nb22 = src2->nb[2]; GGML_UNUSED(nb22);
|
|
||||||
const uint64_t nb23 = src2->nb[3]; GGML_UNUSED(nb23);
|
|
||||||
|
|
||||||
const enum ggml_type src2t = src2->type; GGML_UNUSED(src2t);
|
const enum ggml_type src2t = src2->type; GGML_UNUSED(src2t);
|
||||||
|
|
||||||
GGML_ASSERT(src2t == GGML_TYPE_I32);
|
GGML_ASSERT(src2t == GGML_TYPE_I32);
|
||||||
|
@ -2018,12 +2010,7 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||||
{
|
{
|
||||||
nth0 = 4;
|
nth0 = 4;
|
||||||
nth1 = 16;
|
nth1 = 16;
|
||||||
#if QK_K == 64
|
|
||||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32].pipeline;
|
|
||||||
#else
|
|
||||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32].pipeline;
|
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32].pipeline;
|
||||||
#endif
|
|
||||||
|
|
||||||
} break;
|
} break;
|
||||||
default:
|
default:
|
||||||
{
|
{
|
||||||
|
@ -2088,11 +2075,7 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||||
}
|
}
|
||||||
else if (src0t == GGML_TYPE_Q3_K) {
|
else if (src0t == GGML_TYPE_Q3_K) {
|
||||||
#ifdef GGML_QKK_64
|
|
||||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
|
||||||
#else
|
|
||||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
else if (src0t == GGML_TYPE_Q5_K) {
|
else if (src0t == GGML_TYPE_Q5_K) {
|
||||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||||
|
@ -2244,7 +2227,13 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||||
// skip 3, n_ctx, used in GLM RoPE, unimplemented in metal
|
// skip 3, n_ctx, used in GLM RoPE, unimplemented in metal
|
||||||
const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
|
const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
|
||||||
|
|
||||||
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
|
float freq_base;
|
||||||
|
float freq_scale;
|
||||||
|
float ext_factor;
|
||||||
|
float attn_factor;
|
||||||
|
float beta_fast;
|
||||||
|
float beta_slow;
|
||||||
|
|
||||||
memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
|
memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
|
||||||
memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
|
memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
|
||||||
memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
|
memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
|
||||||
|
@ -2252,6 +2241,15 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||||
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
|
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
|
||||||
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
|
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
|
||||||
|
|
||||||
|
const bool is_neox = mode & 2;
|
||||||
|
const bool is_glm = mode & 4;
|
||||||
|
|
||||||
|
GGML_ASSERT(!is_glm && "GLM RoPE not implemented in Metal");
|
||||||
|
|
||||||
|
if (!is_neox) {
|
||||||
|
GGML_ASSERT(id_src2 == nil && "TODO: freq_factors not implemented for !is_neox");
|
||||||
|
}
|
||||||
|
|
||||||
id<MTLComputePipelineState> pipeline = nil;
|
id<MTLComputePipelineState> pipeline = nil;
|
||||||
|
|
||||||
switch (src0->type) {
|
switch (src0->type) {
|
||||||
|
@ -2263,33 +2261,38 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||||
[encoder setComputePipelineState:pipeline];
|
[encoder setComputePipelineState:pipeline];
|
||||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||||
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
|
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
|
||||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
|
if (id_src2 != nil) {
|
||||||
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3];
|
[encoder setBuffer:id_src2 offset:offs_src2 atIndex:2];
|
||||||
[encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:4];
|
} else {
|
||||||
[encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:5];
|
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:2];
|
||||||
[encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:6];
|
}
|
||||||
[encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:7];
|
[encoder setBuffer:id_dst offset:offs_dst atIndex:3];
|
||||||
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:8];
|
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:4];
|
||||||
[encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:9];
|
[encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:5];
|
||||||
[encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:10];
|
[encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:6];
|
||||||
[encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:11];
|
[encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:7];
|
||||||
[encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:12];
|
[encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:8];
|
||||||
[encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:13];
|
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:9];
|
||||||
[encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:14];
|
[encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:10];
|
||||||
[encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:15];
|
[encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:11];
|
||||||
[encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:16];
|
[encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:12];
|
||||||
[encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:17];
|
[encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:13];
|
||||||
[encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:18];
|
[encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:14];
|
||||||
[encoder setBytes:&n_past length:sizeof( int) atIndex:19];
|
[encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:15];
|
||||||
[encoder setBytes:&n_dims length:sizeof( int) atIndex:20];
|
[encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:16];
|
||||||
[encoder setBytes:&mode length:sizeof( int) atIndex:21];
|
[encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:17];
|
||||||
[encoder setBytes:&n_orig_ctx length:sizeof( int) atIndex:22];
|
[encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:18];
|
||||||
[encoder setBytes:&freq_base length:sizeof( float) atIndex:23];
|
[encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:19];
|
||||||
[encoder setBytes:&freq_scale length:sizeof( float) atIndex:24];
|
[encoder setBytes:&n_past length:sizeof( int) atIndex:20];
|
||||||
[encoder setBytes:&ext_factor length:sizeof( float) atIndex:25];
|
[encoder setBytes:&n_dims length:sizeof( int) atIndex:21];
|
||||||
[encoder setBytes:&attn_factor length:sizeof( float) atIndex:26];
|
[encoder setBytes:&mode length:sizeof( int) atIndex:22];
|
||||||
[encoder setBytes:&beta_fast length:sizeof( float) atIndex:27];
|
[encoder setBytes:&n_orig_ctx length:sizeof( int) atIndex:23];
|
||||||
[encoder setBytes:&beta_slow length:sizeof( float) atIndex:28];
|
[encoder setBytes:&freq_base length:sizeof( float) atIndex:24];
|
||||||
|
[encoder setBytes:&freq_scale length:sizeof( float) atIndex:25];
|
||||||
|
[encoder setBytes:&ext_factor length:sizeof( float) atIndex:26];
|
||||||
|
[encoder setBytes:&attn_factor length:sizeof( float) atIndex:27];
|
||||||
|
[encoder setBytes:&beta_fast length:sizeof( float) atIndex:28];
|
||||||
|
[encoder setBytes:&beta_slow length:sizeof( float) atIndex:29];
|
||||||
|
|
||||||
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
||||||
} break;
|
} break;
|
||||||
|
@ -2535,11 +2538,6 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||||
GGML_ASSERT(!src3 || src3->ne[1] >= GGML_PAD(src0->ne[1], 8) &&
|
GGML_ASSERT(!src3 || src3->ne[1] >= GGML_PAD(src0->ne[1], 8) &&
|
||||||
"the Flash-Attention Metal kernel requires the mask to be padded to 8 and at least n_queries big");
|
"the Flash-Attention Metal kernel requires the mask to be padded to 8 and at least n_queries big");
|
||||||
|
|
||||||
const uint64_t nb20 = src2 ? src2->nb[0] : 0; GGML_UNUSED(nb20);
|
|
||||||
const uint64_t nb21 = src2 ? src2->nb[1] : 0;
|
|
||||||
const uint64_t nb22 = src2 ? src2->nb[2] : 0;
|
|
||||||
const uint64_t nb23 = src2 ? src2->nb[3] : 0;
|
|
||||||
|
|
||||||
const int64_t ne30 = src3 ? src3->ne[0] : 0; GGML_UNUSED(ne30);
|
const int64_t ne30 = src3 ? src3->ne[0] : 0; GGML_UNUSED(ne30);
|
||||||
//const int64_t ne31 = src3 ? src3->ne[1] : 0;
|
//const int64_t ne31 = src3 ? src3->ne[1] : 0;
|
||||||
const int64_t ne32 = src3 ? src3->ne[2] : 0; GGML_UNUSED(ne32);
|
const int64_t ne32 = src3 ? src3->ne[2] : 0; GGML_UNUSED(ne32);
|
||||||
|
|
431
ggml-metal.metal
431
ggml-metal.metal
|
@ -1640,6 +1640,7 @@ static void rope_yarn_corr_dims(
|
||||||
typedef void (rope_t)(
|
typedef void (rope_t)(
|
||||||
device const void * src0,
|
device const void * src0,
|
||||||
device const int32_t * src1,
|
device const int32_t * src1,
|
||||||
|
device const float * src2,
|
||||||
device float * dst,
|
device float * dst,
|
||||||
constant int64_t & ne00,
|
constant int64_t & ne00,
|
||||||
constant int64_t & ne01,
|
constant int64_t & ne01,
|
||||||
|
@ -1675,6 +1676,7 @@ template<typename T>
|
||||||
kernel void kernel_rope(
|
kernel void kernel_rope(
|
||||||
device const void * src0,
|
device const void * src0,
|
||||||
device const int32_t * src1,
|
device const int32_t * src1,
|
||||||
|
device const float * src2,
|
||||||
device float * dst,
|
device float * dst,
|
||||||
constant int64_t & ne00,
|
constant int64_t & ne00,
|
||||||
constant int64_t & ne01,
|
constant int64_t & ne01,
|
||||||
|
@ -1744,8 +1746,10 @@ kernel void kernel_rope(
|
||||||
|
|
||||||
// simplified from `(ib * n_dims + ic) * inv_ndims`
|
// simplified from `(ib * n_dims + ic) * inv_ndims`
|
||||||
const float cur_rot = inv_ndims*ic - ib;
|
const float cur_rot = inv_ndims*ic - ib;
|
||||||
|
const float freq_factor = src2 != src0 ? src2[ic/2] : 1.0f;
|
||||||
|
|
||||||
|
const float theta = theta_0 * pow(freq_base, cur_rot) / freq_factor;
|
||||||
|
|
||||||
const float theta = theta_0 * pow(freq_base, cur_rot);
|
|
||||||
float cos_theta, sin_theta;
|
float cos_theta, sin_theta;
|
||||||
rope_yarn(theta, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
|
rope_yarn(theta, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
|
||||||
|
|
||||||
|
@ -2204,11 +2208,7 @@ kernel void kernel_flash_attn_ext_f16(
|
||||||
// pointer to the mask
|
// pointer to the mask
|
||||||
device const half * mp = (device const half *) (mask + iq1*nb31);
|
device const half * mp = (device const half *) (mask + iq1*nb31);
|
||||||
|
|
||||||
// prepare diagonal scale matrix
|
float slope = 1.0f;
|
||||||
simdgroup_float8x8 mscale(scale);
|
|
||||||
|
|
||||||
// prepare diagonal slope matrix
|
|
||||||
simdgroup_float8x8 mslope(1.0f);
|
|
||||||
|
|
||||||
// ALiBi
|
// ALiBi
|
||||||
if (max_bias > 0.0f) {
|
if (max_bias > 0.0f) {
|
||||||
|
@ -2217,7 +2217,7 @@ kernel void kernel_flash_attn_ext_f16(
|
||||||
const float base = h < n_head_log2 ? m0 : m1;
|
const float base = h < n_head_log2 ? m0 : m1;
|
||||||
const int exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
|
const int exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
|
||||||
|
|
||||||
mslope = simdgroup_float8x8(pow(base, exph));
|
slope = pow(base, exph);
|
||||||
}
|
}
|
||||||
|
|
||||||
// loop over the KV cache
|
// loop over the KV cache
|
||||||
|
@ -2242,18 +2242,20 @@ kernel void kernel_flash_attn_ext_f16(
|
||||||
simdgroup_multiply_accumulate(mqk, mq[i], mk, mqk);
|
simdgroup_multiply_accumulate(mqk, mq[i], mk, mqk);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
simdgroup_store(mqk, ss + 8*cc, TF, 0, false);
|
||||||
|
|
||||||
|
const short tx = tiisg%4;
|
||||||
|
const short ty = tiisg/4;
|
||||||
|
|
||||||
if (mask != q) {
|
if (mask != q) {
|
||||||
// mqk = mqk*scale + mask*slope
|
// mqk = mqk*scale + mask*slope
|
||||||
simdgroup_half8x8 mm;
|
ss[8*cc + ty*TF + 2*tx + 0] = scale*ss[8*cc + ty*TF + 2*tx + 0] + slope*mp[ic + 8*cc + ty*nb31/sizeof(half) + 2*tx + 0];
|
||||||
simdgroup_load(mm, mp + ic + 8*cc, nb31/sizeof(half), 0, false);
|
ss[8*cc + ty*TF + 2*tx + 1] = scale*ss[8*cc + ty*TF + 2*tx + 1] + slope*mp[ic + 8*cc + ty*nb31/sizeof(half) + 2*tx + 1];
|
||||||
simdgroup_multiply(mm, mslope, mm);
|
|
||||||
simdgroup_multiply_accumulate(mqk, mqk, mscale, mm);
|
|
||||||
} else {
|
} else {
|
||||||
// mqk = mqk*scale
|
// mqk = mqk*scale
|
||||||
simdgroup_multiply(mqk, mscale, mqk);
|
ss[8*cc + ty*TF + 2*tx + 0] *= scale;
|
||||||
|
ss[8*cc + ty*TF + 2*tx + 1] *= scale;
|
||||||
}
|
}
|
||||||
|
|
||||||
simdgroup_store(mqk, ss + 8*cc, TF, 0, false);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2816,8 +2818,7 @@ kernel void kernel_cpy_f32_f16(
|
||||||
for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) {
|
for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) {
|
||||||
device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
|
device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
|
||||||
|
|
||||||
// TODO: is there a better way to handle -INFINITY?
|
dst_data[i00] = src[0];
|
||||||
dst_data[i00] = src[0] == -INFINITY ? -MAXHALF : src[0];
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -3385,7 +3386,6 @@ void kernel_mul_mv_q2_K_f32_impl(
|
||||||
|
|
||||||
const int step = sizeof(block_q2_K) * nb;
|
const int step = sizeof(block_q2_K) * nb;
|
||||||
|
|
||||||
#if QK_K == 256
|
|
||||||
const int ix = tiisg/8; // 0...3
|
const int ix = tiisg/8; // 0...3
|
||||||
const int it = tiisg%8; // 0...7
|
const int it = tiisg%8; // 0...7
|
||||||
const int iq = it/4; // 0 or 1
|
const int iq = it/4; // 0 or 1
|
||||||
|
@ -3437,57 +3437,6 @@ void kernel_mul_mv_q2_K_f32_impl(
|
||||||
|
|
||||||
y4 += 4 * QK_K;
|
y4 += 4 * QK_K;
|
||||||
}
|
}
|
||||||
#else
|
|
||||||
const int ix = tiisg/2; // 0...15
|
|
||||||
const int it = tiisg%2; // 0...1
|
|
||||||
|
|
||||||
device const float * y4 = y + ix * QK_K + 8 * it;
|
|
||||||
|
|
||||||
for (int ib = ix; ib < nb; ib += 16) {
|
|
||||||
|
|
||||||
float4 sumy = {0.f, 0.f, 0.f, 0.f};
|
|
||||||
for (int i = 0; i < 8; ++i) {
|
|
||||||
yl[i+ 0] = y4[i+ 0]; sumy[0] += yl[i+ 0];
|
|
||||||
yl[i+ 8] = y4[i+16]; sumy[1] += yl[i+ 8];
|
|
||||||
yl[i+16] = y4[i+32]; sumy[2] += yl[i+16];
|
|
||||||
yl[i+24] = y4[i+48]; sumy[3] += yl[i+24];
|
|
||||||
}
|
|
||||||
|
|
||||||
device const uint8_t * sc = (device const uint8_t *)x[ib].scales;
|
|
||||||
device const uint16_t * qs = (device const uint16_t *)x[ib].qs + 4 * it;
|
|
||||||
device const half * dh = &x[ib].d;
|
|
||||||
|
|
||||||
for (int row = 0; row < N_DST; row++) {
|
|
||||||
|
|
||||||
float4 acc1 = {0.f, 0.f, 0.f, 0.f};
|
|
||||||
float4 acc2 = {0.f, 0.f, 0.f, 0.f};
|
|
||||||
for (int i = 0; i < 8; i += 2) {
|
|
||||||
acc1[0] += yl[i+ 0] * (qs[i/2] & 0x0003);
|
|
||||||
acc2[0] += yl[i+ 1] * (qs[i/2] & 0x0300);
|
|
||||||
acc1[1] += yl[i+ 8] * (qs[i/2] & 0x000c);
|
|
||||||
acc2[1] += yl[i+ 9] * (qs[i/2] & 0x0c00);
|
|
||||||
acc1[2] += yl[i+16] * (qs[i/2] & 0x0030);
|
|
||||||
acc2[2] += yl[i+17] * (qs[i/2] & 0x3000);
|
|
||||||
acc1[3] += yl[i+24] * (qs[i/2] & 0x00c0);
|
|
||||||
acc2[3] += yl[i+25] * (qs[i/2] & 0xc000);
|
|
||||||
}
|
|
||||||
|
|
||||||
float dall = dh[0];
|
|
||||||
float dmin = dh[1];
|
|
||||||
sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc2[0]) * (sc[0] & 0xF) * 1.f/ 1.f +
|
|
||||||
(acc1[1] + 1.f/256.f * acc2[1]) * (sc[1] & 0xF) * 1.f/ 4.f +
|
|
||||||
(acc1[2] + 1.f/256.f * acc2[2]) * (sc[2] & 0xF) * 1.f/16.f +
|
|
||||||
(acc1[3] + 1.f/256.f * acc2[3]) * (sc[3] & 0xF) * 1.f/64.f) -
|
|
||||||
dmin * (sumy[0] * (sc[0] >> 4) + sumy[1] * (sc[1] >> 4) + sumy[2] * (sc[2] >> 4) + sumy[3] * (sc[3] >> 4));
|
|
||||||
|
|
||||||
qs += step/2;
|
|
||||||
sc += step;
|
|
||||||
dh += step/2;
|
|
||||||
}
|
|
||||||
|
|
||||||
y4 += 16 * QK_K;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
for (int row = 0; row < N_DST; ++row) {
|
for (int row = 0; row < N_DST; ++row) {
|
||||||
all_sum = simd_sum(sumf[row]);
|
all_sum = simd_sum(sumf[row]);
|
||||||
|
@ -3525,7 +3474,6 @@ kernel void kernel_mul_mv_q2_K_f32(
|
||||||
kernel_mul_mv_q2_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg);
|
kernel_mul_mv_q2_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg);
|
||||||
}
|
}
|
||||||
|
|
||||||
#if QK_K == 256
|
|
||||||
void kernel_mul_mv_q3_K_f32_impl(
|
void kernel_mul_mv_q3_K_f32_impl(
|
||||||
device const void * src0,
|
device const void * src0,
|
||||||
device const float * src1,
|
device const float * src1,
|
||||||
|
@ -3684,84 +3632,6 @@ void kernel_mul_mv_q3_K_f32_impl(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#else
|
|
||||||
void kernel_mul_mv_q3_K_f32_impl(
|
|
||||||
device const void * src0,
|
|
||||||
device const float * src1,
|
|
||||||
device float * dst,
|
|
||||||
constant int64_t & ne00,
|
|
||||||
constant int64_t & ne01,
|
|
||||||
constant int64_t & ne02,
|
|
||||||
constant int64_t & ne10,
|
|
||||||
constant int64_t & ne12,
|
|
||||||
constant int64_t & ne0,
|
|
||||||
constant int64_t & ne1,
|
|
||||||
constant uint & r2,
|
|
||||||
constant uint & r3,
|
|
||||||
threadgroup int8_t * shared_values [[threadgroup(0)]],
|
|
||||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
|
||||||
uint tiisg[[thread_index_in_simdgroup]],
|
|
||||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
|
||||||
|
|
||||||
const int nb = ne00/QK_K;
|
|
||||||
|
|
||||||
const int64_t r0 = tgpig.x;
|
|
||||||
const int64_t r1 = tgpig.y;
|
|
||||||
const int64_t im = tgpig.z;
|
|
||||||
|
|
||||||
const int row = 2 * r0 + sgitg;
|
|
||||||
|
|
||||||
const uint i12 = im%ne12;
|
|
||||||
const uint i13 = im/ne12;
|
|
||||||
|
|
||||||
const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
|
|
||||||
|
|
||||||
device const block_q3_K * x = (device const block_q3_K *) src0 + row*nb + offset0;
|
|
||||||
device const float * yy = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
|
|
||||||
|
|
||||||
const int ix = tiisg/4;
|
|
||||||
const int il = 4 * (tiisg%4);// 0, 4, 8, 12
|
|
||||||
const int iq = il/8; // 0, 0, 1, 1
|
|
||||||
const int in = il%8; // 0, 4, 0, 4
|
|
||||||
|
|
||||||
float2 sum = {0.f, 0.f};
|
|
||||||
|
|
||||||
for (int i = ix; i < nb; i += 8) {
|
|
||||||
|
|
||||||
const float d_all = (float)(x[i].d);
|
|
||||||
|
|
||||||
device const uint16_t * q = (device const uint16_t *)(x[i].qs + il);
|
|
||||||
device const uint16_t * h = (device const uint16_t *)(x[i].hmask + in);
|
|
||||||
device const uint16_t * s = (device const uint16_t *)(x[i].scales);
|
|
||||||
device const float * y = yy + i * QK_K + il;
|
|
||||||
|
|
||||||
const float d1 = d_all * ((int32_t)(s[0] & 0x000F) - 8);
|
|
||||||
const float d2 = d_all * ((int32_t)(s[0] & 0x00F0) - 128) * 1.f/64.f;
|
|
||||||
const float d3 = d_all * ((int32_t)(s[0] & 0x0F00) - 2048) * 1.f/4096.f;
|
|
||||||
const float d4 = d_all * ((int32_t)(s[0] & 0xF000) - 32768) * 1.f/262144.f;
|
|
||||||
|
|
||||||
for (int l = 0; l < 4; l += 2) {
|
|
||||||
const uint16_t hm = h[l/2] >> iq;
|
|
||||||
sum[0] += y[l+ 0] * d1 * ((int32_t)(q[l/2] & 0x0003) - ((hm & 0x0001) ? 0 : 4))
|
|
||||||
+ y[l+16] * d2 * ((int32_t)(q[l/2] & 0x000c) - ((hm & 0x0004) ? 0 : 16))
|
|
||||||
+ y[l+32] * d3 * ((int32_t)(q[l/2] & 0x0030) - ((hm & 0x0010) ? 0 : 64))
|
|
||||||
+ y[l+48] * d4 * ((int32_t)(q[l/2] & 0x00c0) - ((hm & 0x0040) ? 0 : 256));
|
|
||||||
sum[1] += y[l+ 1] * d1 * ((int32_t)(q[l/2] & 0x0300) - ((hm & 0x0100) ? 0 : 1024))
|
|
||||||
+ y[l+17] * d2 * ((int32_t)(q[l/2] & 0x0c00) - ((hm & 0x0400) ? 0 : 4096))
|
|
||||||
+ y[l+33] * d3 * ((int32_t)(q[l/2] & 0x3000) - ((hm & 0x1000) ? 0 : 16384))
|
|
||||||
+ y[l+49] * d4 * ((int32_t)(q[l/2] & 0xc000) - ((hm & 0x4000) ? 0 : 65536));
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
const float sumf = sum[0] + sum[1] * 1.f/256.f;
|
|
||||||
|
|
||||||
const float tot = simd_sum(sumf);
|
|
||||||
if (tiisg == 0) {
|
|
||||||
dst[r1*ne0 + im*ne0*ne1 + row] = tot;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
[[host_name("kernel_mul_mv_q3_K_f32")]]
|
[[host_name("kernel_mul_mv_q3_K_f32")]]
|
||||||
kernel void kernel_mul_mv_q3_K_f32(
|
kernel void kernel_mul_mv_q3_K_f32(
|
||||||
|
@ -3791,7 +3661,6 @@ kernel void kernel_mul_mv_q3_K_f32(
|
||||||
kernel_mul_mv_q3_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg);
|
kernel_mul_mv_q3_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg);
|
||||||
}
|
}
|
||||||
|
|
||||||
#if QK_K == 256
|
|
||||||
void kernel_mul_mv_q4_K_f32_impl(
|
void kernel_mul_mv_q4_K_f32_impl(
|
||||||
device const void * src0,
|
device const void * src0,
|
||||||
device const float * src1,
|
device const float * src1,
|
||||||
|
@ -3905,103 +3774,6 @@ void kernel_mul_mv_q4_K_f32_impl(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#else
|
|
||||||
void kernel_mul_mv_q4_K_f32_impl(
|
|
||||||
device const void * src0,
|
|
||||||
device const float * src1,
|
|
||||||
device float * dst,
|
|
||||||
constant int64_t & ne00,
|
|
||||||
constant int64_t & ne01,
|
|
||||||
constant int64_t & ne02,
|
|
||||||
constant int64_t & ne10,
|
|
||||||
constant int64_t & ne12,
|
|
||||||
constant int64_t & ne0,
|
|
||||||
constant int64_t & ne1,
|
|
||||||
constant uint & r2,
|
|
||||||
constant uint & r3,
|
|
||||||
threadgroup int8_t * shared_values [[threadgroup(0)]],
|
|
||||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
|
||||||
uint tiisg[[thread_index_in_simdgroup]],
|
|
||||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
|
||||||
|
|
||||||
const int ix = tiisg/4; // 0...7
|
|
||||||
const int it = tiisg%4; // 0...3
|
|
||||||
|
|
||||||
const int nb = ne00/QK_K;
|
|
||||||
const int r0 = tgpig.x;
|
|
||||||
const int r1 = tgpig.y;
|
|
||||||
const int im = tgpig.z;
|
|
||||||
const int first_row = r0 * N_DST;
|
|
||||||
const int ib_row = first_row * nb;
|
|
||||||
|
|
||||||
const uint i12 = im%ne12;
|
|
||||||
const uint i13 = im/ne12;
|
|
||||||
|
|
||||||
const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
|
|
||||||
|
|
||||||
device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row + offset0;
|
|
||||||
device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
|
|
||||||
|
|
||||||
float yl[8];
|
|
||||||
float yh[8];
|
|
||||||
float sumf[N_DST]={0.f}, all_sum;
|
|
||||||
|
|
||||||
const int step = sizeof(block_q4_K) * nb / 2;
|
|
||||||
|
|
||||||
device const float * y4 = y + ix * QK_K + 8 * it;
|
|
||||||
|
|
||||||
uint16_t sc16[4];
|
|
||||||
|
|
||||||
for (int ib = ix; ib < nb; ib += 8) {
|
|
||||||
|
|
||||||
float2 sumy = {0.f, 0.f};
|
|
||||||
for (int i = 0; i < 8; ++i) {
|
|
||||||
yl[i] = y4[i+ 0]; sumy[0] += yl[i];
|
|
||||||
yh[i] = y4[i+32]; sumy[1] += yh[i];
|
|
||||||
}
|
|
||||||
|
|
||||||
device const uint16_t * sc = (device const uint16_t *)x[ib].scales;
|
|
||||||
device const uint16_t * qs = (device const uint16_t *)x[ib].qs + 4 * it;
|
|
||||||
device const half * dh = x[ib].d;
|
|
||||||
|
|
||||||
for (int row = 0; row < N_DST; row++) {
|
|
||||||
|
|
||||||
sc16[0] = sc[0] & 0x000f;
|
|
||||||
sc16[1] = sc[0] & 0x0f00;
|
|
||||||
sc16[2] = sc[0] & 0x00f0;
|
|
||||||
sc16[3] = sc[0] & 0xf000;
|
|
||||||
|
|
||||||
float2 acc1 = {0.f, 0.f};
|
|
||||||
float2 acc2 = {0.f, 0.f};
|
|
||||||
for (int i = 0; i < 8; i += 2) {
|
|
||||||
acc1[0] += yl[i+0] * (qs[i/2] & 0x000F);
|
|
||||||
acc1[1] += yl[i+1] * (qs[i/2] & 0x0F00);
|
|
||||||
acc2[0] += yh[i+0] * (qs[i/2] & 0x00F0);
|
|
||||||
acc2[1] += yh[i+1] * (qs[i/2] & 0xF000);
|
|
||||||
}
|
|
||||||
|
|
||||||
float dall = dh[0];
|
|
||||||
float dmin = dh[1];
|
|
||||||
sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc1[1]) * sc16[0] +
|
|
||||||
(acc2[0] + 1.f/256.f * acc2[1]) * sc16[1] * 1.f/4096.f) -
|
|
||||||
dmin * 1.f/16.f * (sumy[0] * sc16[2] + sumy[1] * sc16[3] * 1.f/256.f);
|
|
||||||
|
|
||||||
qs += step;
|
|
||||||
sc += step;
|
|
||||||
dh += step;
|
|
||||||
}
|
|
||||||
|
|
||||||
y4 += 8 * QK_K;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int row = 0; row < N_DST; ++row) {
|
|
||||||
all_sum = simd_sum(sumf[row]);
|
|
||||||
if (tiisg == 0) {
|
|
||||||
dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
[[host_name("kernel_mul_mv_q4_K_f32")]]
|
[[host_name("kernel_mul_mv_q4_K_f32")]]
|
||||||
kernel void kernel_mul_mv_q4_K_f32(
|
kernel void kernel_mul_mv_q4_K_f32(
|
||||||
|
@ -4069,8 +3841,6 @@ void kernel_mul_mv_q5_K_f32_impl(
|
||||||
|
|
||||||
const int step = sizeof(block_q5_K) * nb;
|
const int step = sizeof(block_q5_K) * nb;
|
||||||
|
|
||||||
#if QK_K == 256
|
|
||||||
#
|
|
||||||
float yl[16], yh[16];
|
float yl[16], yh[16];
|
||||||
|
|
||||||
const uint16_t kmask1 = 0x3f3f;
|
const uint16_t kmask1 = 0x3f3f;
|
||||||
|
@ -4153,54 +3923,6 @@ void kernel_mul_mv_q5_K_f32_impl(
|
||||||
y1 += 4 * QK_K;
|
y1 += 4 * QK_K;
|
||||||
|
|
||||||
}
|
}
|
||||||
#else
|
|
||||||
float yl[8], yh[8];
|
|
||||||
|
|
||||||
const int il = 4 * (tiisg/8); // 0, 4, 8, 12
|
|
||||||
const int ix = tiisg%8;
|
|
||||||
const int iq = il/8; // 0, 0, 1, 1
|
|
||||||
const int in = il%8; // 0, 4, 0, 4
|
|
||||||
|
|
||||||
device const float * y = yy + ix*QK_K + il;
|
|
||||||
|
|
||||||
for (int i = ix; i < nb; i += 8) {
|
|
||||||
|
|
||||||
for (int l = 0; l < 4; ++l) {
|
|
||||||
yl[l+0] = y[l+ 0];
|
|
||||||
yl[l+4] = y[l+16];
|
|
||||||
yh[l+0] = y[l+32];
|
|
||||||
yh[l+4] = y[l+48];
|
|
||||||
}
|
|
||||||
|
|
||||||
device const half * dh = &x[i].d;
|
|
||||||
device const uint8_t * q = x[i].qs + il;
|
|
||||||
device const uint8_t * h = x[i].qh + in;
|
|
||||||
device const int8_t * s = x[i].scales;
|
|
||||||
|
|
||||||
for (int row = 0; row < 2; ++row) {
|
|
||||||
|
|
||||||
const float d = dh[0];
|
|
||||||
|
|
||||||
float2 acc = {0.f, 0.f};
|
|
||||||
for (int l = 0; l < 4; ++l) {
|
|
||||||
const uint8_t hl = h[l] >> iq;
|
|
||||||
acc[0] += yl[l+0] * s[0] * ((int16_t)(q[l+ 0] & 0x0F) - (hl & 0x01 ? 0 : 16))
|
|
||||||
+ yl[l+4] * s[1] * ((int16_t)(q[l+16] & 0x0F) - (hl & 0x04 ? 0 : 16));
|
|
||||||
acc[1] += yh[l+0] * s[2] * ((int16_t)(q[l+ 0] & 0xF0) - (hl & 0x10 ? 0 : 256))
|
|
||||||
+ yh[l+4] * s[3] * ((int16_t)(q[l+16] & 0xF0) - (hl & 0x40 ? 0 : 256));
|
|
||||||
}
|
|
||||||
sumf[row] += d * (acc[0] + 1.f/16.f * acc[1]);
|
|
||||||
|
|
||||||
q += step;
|
|
||||||
h += step;
|
|
||||||
s += step;
|
|
||||||
dh += step/2;
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
y += 8 * QK_K;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
for (int row = 0; row < 2; ++row) {
|
for (int row = 0; row < 2; ++row) {
|
||||||
const float tot = simd_sum(sumf[row]);
|
const float tot = simd_sum(sumf[row]);
|
||||||
|
@ -4279,7 +4001,6 @@ void kernel_mul_mv_q6_K_f32_impl(
|
||||||
|
|
||||||
float sumf = 0;
|
float sumf = 0;
|
||||||
|
|
||||||
#if QK_K == 256
|
|
||||||
const int tid = tiisg/2;
|
const int tid = tiisg/2;
|
||||||
const int ix = tiisg%2;
|
const int ix = tiisg%2;
|
||||||
const int ip = tid/8; // 0 or 1
|
const int ip = tid/8; // 0 or 1
|
||||||
|
@ -4315,30 +4036,6 @@ void kernel_mul_mv_q6_K_f32_impl(
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#else
|
|
||||||
const int ix = tiisg/4;
|
|
||||||
const int il = 4*(tiisg%4);
|
|
||||||
|
|
||||||
for (int i = ix; i < nb; i += 8) {
|
|
||||||
device const float * y = yy + i * QK_K + il;
|
|
||||||
device const uint8_t * ql = x[i].ql + il;
|
|
||||||
device const uint8_t * qh = x[i].qh + il;
|
|
||||||
device const int8_t * s = x[i].scales;
|
|
||||||
|
|
||||||
const float d = x[i].d;
|
|
||||||
|
|
||||||
float4 sums = {0.f, 0.f, 0.f, 0.f};
|
|
||||||
for (int l = 0; l < 4; ++l) {
|
|
||||||
sums[0] += y[l+ 0] * ((int8_t)((ql[l+ 0] & 0xF) | ((qh[l] & kmask1) << 4)) - 32);
|
|
||||||
sums[1] += y[l+16] * ((int8_t)((ql[l+16] & 0xF) | ((qh[l] & kmask2) << 2)) - 32);
|
|
||||||
sums[2] += y[l+32] * ((int8_t)((ql[l+ 0] >> 4) | ((qh[l] & kmask3) >> 0)) - 32);
|
|
||||||
sums[3] += y[l+48] * ((int8_t)((ql[l+16] >> 4) | ((qh[l] & kmask4) >> 2)) - 32);
|
|
||||||
}
|
|
||||||
sumf += d * (sums[0] * s[0] + sums[1] * s[1] + sums[2] * s[2] + sums[3] * s[3]);
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
const float tot = simd_sum(sumf);
|
const float tot = simd_sum(sumf);
|
||||||
if (tiisg == 0) {
|
if (tiisg == 0) {
|
||||||
dst[r1*ne0 + im*ne0*ne1 + row] = tot;
|
dst[r1*ne0 + im*ne0*ne1 + row] = tot;
|
||||||
|
@ -5172,9 +4869,7 @@ void kernel_mul_mv_iq1_m_f32_impl(
|
||||||
|
|
||||||
device const float * y4 = y + 32 * ix;
|
device const float * y4 = y + 32 * ix;
|
||||||
|
|
||||||
#if QK_K != 64
|
|
||||||
iq1m_scale_t scale;
|
iq1m_scale_t scale;
|
||||||
#endif
|
|
||||||
|
|
||||||
for (int ib32 = ix; ib32 < nb32; ib32 += 32) {
|
for (int ib32 = ix; ib32 < nb32; ib32 += 32) {
|
||||||
|
|
||||||
|
@ -5195,10 +4890,7 @@ void kernel_mul_mv_iq1_m_f32_impl(
|
||||||
device const uint16_t * sc = (device const uint16_t *)xr->scales;
|
device const uint16_t * sc = (device const uint16_t *)xr->scales;
|
||||||
|
|
||||||
for (int row = 0; row < N_DST; row++) {
|
for (int row = 0; row < N_DST; row++) {
|
||||||
|
|
||||||
#if QK_K != 64
|
|
||||||
scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
|
scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
|
||||||
#endif
|
|
||||||
|
|
||||||
constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((qh[0] << 8) & 0x700)));
|
constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((qh[0] << 8) & 0x700)));
|
||||||
constant uint8_t * grid2 = (constant uint8_t *)(iq1s_grid_gpu + (qs[1] | ((qh[0] << 4) & 0x700)));
|
constant uint8_t * grid2 = (constant uint8_t *)(iq1s_grid_gpu + (qs[1] | ((qh[0] << 4) & 0x700)));
|
||||||
|
@ -5214,14 +4906,9 @@ void kernel_mul_mv_iq1_m_f32_impl(
|
||||||
}
|
}
|
||||||
const float delta1 = sumy[0] * (qh[0] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA) + sumy[1] * (qh[0] & 0x80 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA);
|
const float delta1 = sumy[0] * (qh[0] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA) + sumy[1] * (qh[0] & 0x80 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA);
|
||||||
const float delta2 = sumy[2] * (qh[1] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA) + sumy[3] * (qh[1] & 0x80 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA);
|
const float delta2 = sumy[2] * (qh[1] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA) + sumy[3] * (qh[1] & 0x80 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA);
|
||||||
#if QK_K == 64
|
|
||||||
const float d = (float) *((device const half *)(sc - 1));
|
|
||||||
sumf[row] += d * ((sum[0] + delta1) * (2*((sc[0] >> (8*(ib%2)+0)) & 0xf) + 1) +
|
|
||||||
(sum[1] + delta2) * (2*((sc[0] >> (8*(ib%2)+4)) & 0xf) + 1));
|
|
||||||
#else
|
|
||||||
sumf[row] += (float)scale.f16 * ((sum[0] + delta1) * (2*((sc[ib/2] >> (6*(ib%2)+0)) & 7) + 1) +
|
sumf[row] += (float)scale.f16 * ((sum[0] + delta1) * (2*((sc[ib/2] >> (6*(ib%2)+0)) & 7) + 1) +
|
||||||
(sum[1] + delta2) * (2*((sc[ib/2] >> (6*(ib%2)+3)) & 7) + 1));
|
(sum[1] + delta2) * (2*((sc[ib/2] >> (6*(ib%2)+3)) & 7) + 1));
|
||||||
#endif
|
|
||||||
|
|
||||||
sc += nb*sizeof(block_iq1_m)/2;
|
sc += nb*sizeof(block_iq1_m)/2;
|
||||||
qs += nb*sizeof(block_iq1_m);
|
qs += nb*sizeof(block_iq1_m);
|
||||||
|
@ -5333,7 +5020,6 @@ void kernel_mul_mv_iq4_nl_f32_impl(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#if QK_K != 64
|
|
||||||
void kernel_mul_mv_iq4_xs_f32_impl(
|
void kernel_mul_mv_iq4_xs_f32_impl(
|
||||||
device const void * src0,
|
device const void * src0,
|
||||||
device const float * src1,
|
device const float * src1,
|
||||||
|
@ -5428,7 +5114,6 @@ void kernel_mul_mv_iq4_xs_f32_impl(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
|
|
||||||
[[host_name("kernel_mul_mv_iq1_s_f32")]]
|
[[host_name("kernel_mul_mv_iq1_s_f32")]]
|
||||||
kernel void kernel_mul_mv_iq1_s_f32(
|
kernel void kernel_mul_mv_iq1_s_f32(
|
||||||
|
@ -5541,11 +5226,7 @@ kernel void kernel_mul_mv_iq4_xs_f32(
|
||||||
uint tiisg[[thread_index_in_simdgroup]],
|
uint tiisg[[thread_index_in_simdgroup]],
|
||||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||||
|
|
||||||
#if QK_K == 64
|
|
||||||
kernel_mul_mv_iq4_nl_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
|
|
||||||
#else
|
|
||||||
kernel_mul_mv_iq4_xs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
|
kernel_mul_mv_iq4_xs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//============================= templates and their specializations =============================
|
//============================= templates and their specializations =============================
|
||||||
|
@ -5671,10 +5352,9 @@ void dequantize_q2_K(device const block_q2_K *xb, short il, thread type4x4 & reg
|
||||||
float dl, ml;
|
float dl, ml;
|
||||||
uint8_t sc = xb->scales[il];
|
uint8_t sc = xb->scales[il];
|
||||||
|
|
||||||
#if QK_K == 256
|
|
||||||
q = q + 32*(il/8) + 16*(il&1);
|
q = q + 32*(il/8) + 16*(il&1);
|
||||||
il = (il/2)%4;
|
il = (il/2)%4;
|
||||||
#endif
|
|
||||||
half coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h);
|
half coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h);
|
||||||
uchar mask = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3);
|
uchar mask = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3);
|
||||||
dl = d * (sc & 0xF) * coef, ml = min * (sc >> 4);
|
dl = d * (sc & 0xF) * coef, ml = min * (sc >> 4);
|
||||||
|
@ -5690,7 +5370,6 @@ void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg
|
||||||
device const uint8_t * h = (device const uint8_t *)xb->hmask;
|
device const uint8_t * h = (device const uint8_t *)xb->hmask;
|
||||||
device const int8_t * scales = (device const int8_t *)xb->scales;
|
device const int8_t * scales = (device const int8_t *)xb->scales;
|
||||||
|
|
||||||
#if QK_K == 256
|
|
||||||
q = q + 32 * (il/8) + 16 * (il&1);
|
q = q + 32 * (il/8) + 16 * (il&1);
|
||||||
h = h + 16 * (il&1);
|
h = h + 16 * (il&1);
|
||||||
uint8_t m = 1 << (il/2);
|
uint8_t m = 1 << (il/2);
|
||||||
|
@ -5711,17 +5390,6 @@ void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg
|
||||||
for (int i = 0; i < 16; ++i) {
|
for (int i = 0; i < 16; ++i) {
|
||||||
reg[i/4][i%4] = dl * (q[i] & mask) - (h[i] & m ? 0 : ml);
|
reg[i/4][i%4] = dl * (q[i] & mask) - (h[i] & m ? 0 : ml);
|
||||||
}
|
}
|
||||||
#else
|
|
||||||
float kcoef = il&1 ? 1.f/16.f : 1.f;
|
|
||||||
uint16_t kmask = il&1 ? 0xF0 : 0x0F;
|
|
||||||
float dl = d_all * ((scales[il/2] & kmask) * kcoef - 8);
|
|
||||||
float coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h);
|
|
||||||
uint8_t mask = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3);
|
|
||||||
uint8_t m = 1<<(il*2);
|
|
||||||
for (int i = 0; i < 16; ++i) {
|
|
||||||
reg[i/4][i%4] = coef * dl * ((q[i] & mask) - ((h[i%8] & (m * (1 + i/8))) ? 0 : 4.f/coef));
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline uchar2 get_scale_min_k4_just2(int j, int k, device const uchar * q) {
|
static inline uchar2 get_scale_min_k4_just2(int j, int k, device const uchar * q) {
|
||||||
|
@ -5733,7 +5401,6 @@ template <typename type4x4>
|
||||||
void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg) {
|
void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg) {
|
||||||
device const uchar * q = xb->qs;
|
device const uchar * q = xb->qs;
|
||||||
|
|
||||||
#if QK_K == 256
|
|
||||||
short is = (il/4) * 2;
|
short is = (il/4) * 2;
|
||||||
q = q + (il/4) * 32 + 16 * (il&1);
|
q = q + (il/4) * 32 + 16 * (il&1);
|
||||||
il = il & 3;
|
il = il & 3;
|
||||||
|
@ -5742,16 +5409,7 @@ void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg
|
||||||
const float min = xb->dmin;
|
const float min = xb->dmin;
|
||||||
const float dl = d * sc[0];
|
const float dl = d * sc[0];
|
||||||
const float ml = min * sc[1];
|
const float ml = min * sc[1];
|
||||||
#else
|
|
||||||
(void) get_scale_min_k4_just2;
|
|
||||||
|
|
||||||
q = q + 16 * (il&1);
|
|
||||||
device const uint8_t * s = xb->scales;
|
|
||||||
device const half2 * dh = (device const half2 *)xb->d;
|
|
||||||
const float2 d = (float2)dh[0];
|
|
||||||
const float dl = il<2 ? d[0] * (s[0]&0xF) : d[0] * (s[1]&0xF)/16.h;
|
|
||||||
const float ml = il<2 ? d[1] * (s[0]>>4) : d[1] * (s[1]>>4);
|
|
||||||
#endif
|
|
||||||
const ushort mask = il<2 ? 0x0F : 0xF0;
|
const ushort mask = il<2 ? 0x0F : 0xF0;
|
||||||
for (int i = 0; i < 16; ++i) {
|
for (int i = 0; i < 16; ++i) {
|
||||||
reg[i/4][i%4] = dl * (q[i] & mask) - ml;
|
reg[i/4][i%4] = dl * (q[i] & mask) - ml;
|
||||||
|
@ -5763,7 +5421,6 @@ void dequantize_q5_K(device const block_q5_K *xb, short il, thread type4x4 & reg
|
||||||
device const uint8_t * q = xb->qs;
|
device const uint8_t * q = xb->qs;
|
||||||
device const uint8_t * qh = xb->qh;
|
device const uint8_t * qh = xb->qh;
|
||||||
|
|
||||||
#if QK_K == 256
|
|
||||||
short is = (il/4) * 2;
|
short is = (il/4) * 2;
|
||||||
q = q + 32 * (il/4) + 16 * (il&1);
|
q = q + 32 * (il/4) + 16 * (il&1);
|
||||||
qh = qh + 16 * (il&1);
|
qh = qh + 16 * (il&1);
|
||||||
|
@ -5780,17 +5437,6 @@ void dequantize_q5_K(device const block_q5_K *xb, short il, thread type4x4 & reg
|
||||||
for (int i = 0; i < 16; ++i) {
|
for (int i = 0; i < 16; ++i) {
|
||||||
reg[i/4][i%4] = dl * ((q[i] & mask) + (qh[i] & ul ? qh_val : 0)) - ml;
|
reg[i/4][i%4] = dl * ((q[i] & mask) + (qh[i] & ul ? qh_val : 0)) - ml;
|
||||||
}
|
}
|
||||||
#else
|
|
||||||
q = q + 16 * (il&1);
|
|
||||||
device const int8_t * s = xb->scales;
|
|
||||||
const float dl = xb->d * s[il];
|
|
||||||
uint8_t m = 1<<(il*2);
|
|
||||||
const float coef = il<2 ? 1.f : 1.f/16.f;
|
|
||||||
const ushort mask = il<2 ? 0x0F : 0xF0;
|
|
||||||
for (int i = 0; i < 16; ++i) {
|
|
||||||
reg[i/4][i%4] = coef * dl * ((q[i] & mask) - (qh[i%8] & (m*(1+i/8)) ? 0.f : 16.f/coef));
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename type4x4>
|
template <typename type4x4>
|
||||||
|
@ -5800,15 +5446,11 @@ void dequantize_q6_K(device const block_q6_K *xb, short il, thread type4x4 & reg
|
||||||
device const uint8_t * qh = (device const uint8_t *)xb->qh;
|
device const uint8_t * qh = (device const uint8_t *)xb->qh;
|
||||||
device const int8_t * scales = (device const int8_t *)xb->scales;
|
device const int8_t * scales = (device const int8_t *)xb->scales;
|
||||||
|
|
||||||
#if QK_K == 256
|
|
||||||
ql = ql + 64*(il/8) + 32*((il/2)&1) + 16*(il&1);
|
ql = ql + 64*(il/8) + 32*((il/2)&1) + 16*(il&1);
|
||||||
qh = qh + 32*(il/8) + 16*(il&1);
|
qh = qh + 32*(il/8) + 16*(il&1);
|
||||||
float sc = scales[(il%2) + 2 * ((il/2))];
|
float sc = scales[(il%2) + 2 * ((il/2))];
|
||||||
il = (il/2) & 3;
|
il = (il/2) & 3;
|
||||||
#else
|
|
||||||
ql = ql + 16 * (il&1);
|
|
||||||
float sc = scales[il];
|
|
||||||
#endif
|
|
||||||
const uint16_t kmask1 = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3);
|
const uint16_t kmask1 = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3);
|
||||||
const uint16_t kmask2 = il>1 ? 0xF0 : 0x0F;
|
const uint16_t kmask2 = il>1 ? 0xF0 : 0x0F;
|
||||||
const float coef = il>1 ? 1.f/16.f : 1.f;
|
const float coef = il>1 ? 1.f/16.f : 1.f;
|
||||||
|
@ -5965,20 +5607,15 @@ void dequantize_iq1_m(device const block_iq1_m * xb, short il, thread type4x4 &
|
||||||
const int ib32 = il/2;
|
const int ib32 = il/2;
|
||||||
il = il%2;
|
il = il%2;
|
||||||
device const uint16_t * sc = (device const uint16_t *)xb->scales;
|
device const uint16_t * sc = (device const uint16_t *)xb->scales;
|
||||||
#if QK_K == 64
|
|
||||||
const float d = xb->d;
|
|
||||||
#else
|
|
||||||
iq1m_scale_t scale;
|
iq1m_scale_t scale;
|
||||||
scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
|
scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
|
||||||
const float d = scale.f16;
|
const float d = scale.f16;
|
||||||
#endif
|
|
||||||
device const uint8_t * qs = xb->qs + 4*ib32 + 2*il;
|
device const uint8_t * qs = xb->qs + 4*ib32 + 2*il;
|
||||||
device const uint8_t * qh = xb->qh + 2*ib32 + il;
|
device const uint8_t * qh = xb->qh + 2*ib32 + il;
|
||||||
#if QK_K == 64
|
|
||||||
const float dl = d * (2*((sc[ib32/2] >> (8*(ib32%2)+4*il)) & 0xf) + 1);
|
|
||||||
#else
|
|
||||||
const float dl = d * (2*((sc[ib32/2] >> (6*(ib32%2)+3*il)) & 7) + 1);
|
const float dl = d * (2*((sc[ib32/2] >> (6*(ib32%2)+3*il)) & 7) + 1);
|
||||||
#endif
|
|
||||||
const float ml1 = dl * (qh[0] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA);
|
const float ml1 = dl * (qh[0] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA);
|
||||||
const float ml2 = dl * (qh[0] & 0x80 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA);
|
const float ml2 = dl * (qh[0] & 0x80 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA);
|
||||||
constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((qh[0] << 8) & 0x700)));
|
constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((qh[0] << 8) & 0x700)));
|
||||||
|
@ -6008,9 +5645,6 @@ void dequantize_iq4_nl(device const block_iq4_nl * xb, short il, thread type4x4
|
||||||
|
|
||||||
template <typename type4x4>
|
template <typename type4x4>
|
||||||
void dequantize_iq4_xs(device const block_iq4_xs * xb, short il, thread type4x4 & reg) {
|
void dequantize_iq4_xs(device const block_iq4_xs * xb, short il, thread type4x4 & reg) {
|
||||||
#if QK_K == 64
|
|
||||||
dequantize_iq4_nl(xb, il, reg);
|
|
||||||
#else
|
|
||||||
// il is 0...15 for QK_K = 256 => index of block of 32 is il/2
|
// il is 0...15 for QK_K = 256 => index of block of 32 is il/2
|
||||||
const int ib32 = il/2;
|
const int ib32 = il/2;
|
||||||
il = il%2;
|
il = il%2;
|
||||||
|
@ -6027,7 +5661,6 @@ void dequantize_iq4_xs(device const block_iq4_xs * xb, short il, thread type4x4
|
||||||
reg[i][2] = d * kvalues_iq4nl_f[q8[2]];
|
reg[i][2] = d * kvalues_iq4nl_f[q8[2]];
|
||||||
reg[i][3] = d * kvalues_iq4nl_f[q8[3]];
|
reg[i][3] = d * kvalues_iq4nl_f[q8[3]];
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread float4x4 &)>
|
template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread float4x4 &)>
|
||||||
|
@ -6532,11 +6165,7 @@ kernel void kernel_mul_mm_id(
|
||||||
sgitg);
|
sgitg);
|
||||||
}
|
}
|
||||||
|
|
||||||
#if QK_K == 256
|
|
||||||
#define QK_NL 16
|
#define QK_NL 16
|
||||||
#else
|
|
||||||
#define QK_NL 4
|
|
||||||
#endif
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// get rows
|
// get rows
|
||||||
|
@ -6576,11 +6205,7 @@ template [[host_name("kernel_get_rows_iq2_s")]] kernel get_rows_t kernel_get_r
|
||||||
template [[host_name("kernel_get_rows_iq1_s")]] kernel get_rows_t kernel_get_rows<block_iq1_s, QK_NL, dequantize_iq1_s>;
|
template [[host_name("kernel_get_rows_iq1_s")]] kernel get_rows_t kernel_get_rows<block_iq1_s, QK_NL, dequantize_iq1_s>;
|
||||||
template [[host_name("kernel_get_rows_iq1_m")]] kernel get_rows_t kernel_get_rows<block_iq1_m, QK_NL, dequantize_iq1_m>;
|
template [[host_name("kernel_get_rows_iq1_m")]] kernel get_rows_t kernel_get_rows<block_iq1_m, QK_NL, dequantize_iq1_m>;
|
||||||
template [[host_name("kernel_get_rows_iq4_nl")]] kernel get_rows_t kernel_get_rows<block_iq4_nl, 2, dequantize_iq4_nl>;
|
template [[host_name("kernel_get_rows_iq4_nl")]] kernel get_rows_t kernel_get_rows<block_iq4_nl, 2, dequantize_iq4_nl>;
|
||||||
#if QK_K == 64
|
|
||||||
template [[host_name("kernel_get_rows_iq4_xs")]] kernel get_rows_t kernel_get_rows<block_iq4_xs, 2, dequantize_iq4_xs>;
|
|
||||||
#else
|
|
||||||
template [[host_name("kernel_get_rows_iq4_xs")]] kernel get_rows_t kernel_get_rows<block_iq4_xs, QK_NL, dequantize_iq4_xs>;
|
template [[host_name("kernel_get_rows_iq4_xs")]] kernel get_rows_t kernel_get_rows<block_iq4_xs, QK_NL, dequantize_iq4_xs>;
|
||||||
#endif
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// matrix-matrix multiplication
|
// matrix-matrix multiplication
|
||||||
|
@ -6608,11 +6233,7 @@ template [[host_name("kernel_mul_mm_iq2_s_f32")]] kernel mat_mm_t kernel_mul_m
|
||||||
template [[host_name("kernel_mul_mm_iq1_s_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq1_s, QK_NL, dequantize_iq1_s>;
|
template [[host_name("kernel_mul_mm_iq1_s_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq1_s, QK_NL, dequantize_iq1_s>;
|
||||||
template [[host_name("kernel_mul_mm_iq1_m_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq1_m, QK_NL, dequantize_iq1_m>;
|
template [[host_name("kernel_mul_mm_iq1_m_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq1_m, QK_NL, dequantize_iq1_m>;
|
||||||
template [[host_name("kernel_mul_mm_iq4_nl_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq4_nl, 2, dequantize_iq4_nl>;
|
template [[host_name("kernel_mul_mm_iq4_nl_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq4_nl, 2, dequantize_iq4_nl>;
|
||||||
#if QK_K == 64
|
|
||||||
template [[host_name("kernel_mul_mm_iq4_xs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq4_nl, 2, dequantize_iq4_xs>;
|
|
||||||
#else
|
|
||||||
template [[host_name("kernel_mul_mm_iq4_xs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq4_xs, QK_NL, dequantize_iq4_xs>;
|
template [[host_name("kernel_mul_mm_iq4_xs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq4_xs, QK_NL, dequantize_iq4_xs>;
|
||||||
#endif
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// indirect matrix-matrix multiplication
|
// indirect matrix-matrix multiplication
|
||||||
|
@ -6640,11 +6261,7 @@ template [[host_name("kernel_mul_mm_id_iq2_s_f32")]] kernel mat_mm_id_t kernel
|
||||||
template [[host_name("kernel_mul_mm_id_iq1_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq1_s, QK_NL, dequantize_iq1_s>;
|
template [[host_name("kernel_mul_mm_id_iq1_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq1_s, QK_NL, dequantize_iq1_s>;
|
||||||
template [[host_name("kernel_mul_mm_id_iq1_m_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq1_m, QK_NL, dequantize_iq1_m>;
|
template [[host_name("kernel_mul_mm_id_iq1_m_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq1_m, QK_NL, dequantize_iq1_m>;
|
||||||
template [[host_name("kernel_mul_mm_id_iq4_nl_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_nl, 2, dequantize_iq4_nl>;
|
template [[host_name("kernel_mul_mm_id_iq4_nl_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_nl, 2, dequantize_iq4_nl>;
|
||||||
#if QK_K == 64
|
|
||||||
template [[host_name("kernel_mul_mm_id_iq4_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_xs, 2, dequantize_iq4_xs>;
|
|
||||||
#else
|
|
||||||
template [[host_name("kernel_mul_mm_id_iq4_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_xs, QK_NL, dequantize_iq4_xs>;
|
template [[host_name("kernel_mul_mm_id_iq4_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_xs, QK_NL, dequantize_iq4_xs>;
|
||||||
#endif
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// matrix-vector multiplication
|
// matrix-vector multiplication
|
||||||
|
@ -6853,7 +6470,5 @@ template [[host_name("kernel_mul_mv_id_iq3_xxs_f32")]] kernel kernel_mul_mv_id_t
|
||||||
template [[host_name("kernel_mul_mv_id_iq3_s_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq3_s_f32_impl>>;
|
template [[host_name("kernel_mul_mv_id_iq3_s_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq3_s_f32_impl>>;
|
||||||
template [[host_name("kernel_mul_mv_id_iq2_s_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq2_s_f32_impl>>;
|
template [[host_name("kernel_mul_mv_id_iq2_s_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq2_s_f32_impl>>;
|
||||||
template [[host_name("kernel_mul_mv_id_iq4_nl_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq4_nl_f32_impl>>;
|
template [[host_name("kernel_mul_mv_id_iq4_nl_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq4_nl_f32_impl>>;
|
||||||
#if QK_K != 64
|
|
||||||
template [[host_name("kernel_mul_mv_id_iq4_xs_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq4_xs_f32_impl>>;
|
template [[host_name("kernel_mul_mv_id_iq4_xs_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq4_xs_f32_impl>>;
|
||||||
#endif
|
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
#include "ggml-opencl.h"
|
#include "ggml-opencl.h"
|
||||||
#include "ggml-backend-impl.h"
|
#include "ggml-backend-impl.h"
|
||||||
|
|
||||||
|
|
2672
ggml-quants.c
2672
ggml-quants.c
File diff suppressed because it is too large
Load diff
475
ggml-sycl.cpp
475
ggml-sycl.cpp
|
@ -4197,7 +4197,6 @@ static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restri
|
||||||
const block_q2_K * x = (const block_q2_K *) vx;
|
const block_q2_K * x = (const block_q2_K *) vx;
|
||||||
|
|
||||||
const int tid = item_ct1.get_local_id(2);
|
const int tid = item_ct1.get_local_id(2);
|
||||||
#if QK_K == 256
|
|
||||||
const int n = tid/32;
|
const int n = tid/32;
|
||||||
const int l = tid - 32*n;
|
const int l = tid - 32*n;
|
||||||
const int is = 8*n + l/16;
|
const int is = 8*n + l/16;
|
||||||
|
@ -4211,18 +4210,6 @@ static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restri
|
||||||
y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
|
y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
|
||||||
y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
|
y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
|
||||||
y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
|
y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
|
||||||
#else
|
|
||||||
const int is = tid/16; // 0 or 1
|
|
||||||
const int il = tid%16; // 0...15
|
|
||||||
const uint8_t q = x[i].qs[il] >> (2*is);
|
|
||||||
dst_t * y = yy + i*QK_K + 16*is + il;
|
|
||||||
|
|
||||||
float dall = x[i].dm[0];
|
|
||||||
float dmin = x[i].dm[1];
|
|
||||||
y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
|
|
||||||
y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename dst_t>
|
template<typename dst_t>
|
||||||
|
@ -4232,7 +4219,6 @@ static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restri
|
||||||
const int i = item_ct1.get_group(2);
|
const int i = item_ct1.get_group(2);
|
||||||
const block_q3_K * x = (const block_q3_K *) vx;
|
const block_q3_K * x = (const block_q3_K *) vx;
|
||||||
|
|
||||||
#if QK_K == 256
|
|
||||||
const int r = item_ct1.get_local_id(2) / 4;
|
const int r = item_ct1.get_local_id(2) / 4;
|
||||||
const int tid = r/2;
|
const int tid = r/2;
|
||||||
const int is0 = r%2;
|
const int is0 = r%2;
|
||||||
|
@ -4256,31 +4242,8 @@ static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restri
|
||||||
const uint8_t * hm = x[i].hmask;
|
const uint8_t * hm = x[i].hmask;
|
||||||
|
|
||||||
for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
|
for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
|
||||||
#else
|
|
||||||
const int tid = item_ct1.get_local_id(2);
|
|
||||||
const int is = tid/16; // 0 or 1
|
|
||||||
const int il = tid%16; // 0...15
|
|
||||||
const int im = il/8; // 0...1
|
|
||||||
const int in = il%8; // 0...7
|
|
||||||
|
|
||||||
dst_t * y = yy + i*QK_K + 16*is + il;
|
|
||||||
|
|
||||||
const uint8_t q = x[i].qs[il] >> (2*is);
|
|
||||||
const uint8_t h = x[i].hmask[in] >> (2*is + im);
|
|
||||||
const float d = (float)x[i].d;
|
|
||||||
|
|
||||||
if (is == 0) {
|
|
||||||
y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
|
|
||||||
y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
|
|
||||||
} else {
|
|
||||||
y[ 0] = d * ((x[i].scales[0] >> 4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
|
|
||||||
y[32] = d * ((x[i].scales[1] >> 4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#if QK_K == 256
|
|
||||||
static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
|
static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
|
||||||
if (j < 4) {
|
if (j < 4) {
|
||||||
d = q[j] & 63; m = q[j + 4] & 63;
|
d = q[j] & 63; m = q[j + 4] & 63;
|
||||||
|
@ -4289,7 +4252,6 @@ static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8
|
||||||
m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4);
|
m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
|
|
||||||
template<typename dst_t>
|
template<typename dst_t>
|
||||||
static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
|
static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
|
||||||
|
@ -4298,7 +4260,6 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restri
|
||||||
|
|
||||||
const int i = item_ct1.get_group(2);
|
const int i = item_ct1.get_group(2);
|
||||||
|
|
||||||
#if QK_K == 256
|
|
||||||
// assume 32 threads
|
// assume 32 threads
|
||||||
const int tid = item_ct1.get_local_id(2);
|
const int tid = item_ct1.get_local_id(2);
|
||||||
const int il = tid/8;
|
const int il = tid/8;
|
||||||
|
@ -4322,15 +4283,6 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restri
|
||||||
y[l + 0] = d1 * (q[l] & 0xF) - m1;
|
y[l + 0] = d1 * (q[l] & 0xF) - m1;
|
||||||
y[l +32] = d2 * (q[l] >> 4) - m2;
|
y[l +32] = d2 * (q[l] >> 4) - m2;
|
||||||
}
|
}
|
||||||
#else
|
|
||||||
const int tid = item_ct1.get_local_id(2);
|
|
||||||
const uint8_t * q = x[i].qs;
|
|
||||||
dst_t * y = yy + i*QK_K;
|
|
||||||
const float d = (float)x[i].dm[0];
|
|
||||||
const float m = (float)x[i].dm[1];
|
|
||||||
y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
|
|
||||||
y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >> 4) - m * (x[i].scales[1] >> 4);
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename dst_t>
|
template<typename dst_t>
|
||||||
|
@ -4340,7 +4292,6 @@ static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restri
|
||||||
|
|
||||||
const int i = item_ct1.get_group(2);
|
const int i = item_ct1.get_group(2);
|
||||||
|
|
||||||
#if QK_K == 256
|
|
||||||
// assume 64 threads - this is very slightly better than the one below
|
// assume 64 threads - this is very slightly better than the one below
|
||||||
const int tid = item_ct1.get_local_id(2);
|
const int tid = item_ct1.get_local_id(2);
|
||||||
const int il = tid/16; // il is in 0...3
|
const int il = tid/16; // il is in 0...3
|
||||||
|
@ -4367,18 +4318,6 @@ static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restri
|
||||||
hm <<= 1;
|
hm <<= 1;
|
||||||
y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2;
|
y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2;
|
||||||
y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
|
y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
|
||||||
#else
|
|
||||||
const int tid = item_ct1.get_local_id(2);
|
|
||||||
const uint8_t q = x[i].qs[tid];
|
|
||||||
const int im = tid/8; // 0...3
|
|
||||||
const int in = tid%8; // 0...7
|
|
||||||
const int is = tid/16; // 0 or 1
|
|
||||||
const uint8_t h = x[i].qh[in] >> im;
|
|
||||||
const float d = x[i].d;
|
|
||||||
dst_t * y = yy + i*QK_K + tid;
|
|
||||||
y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16));
|
|
||||||
y[32] = d * x[i].scales[is+2] * ((q >> 4) - ((h >> 4) & 1 ? 0 : 16));
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename dst_t>
|
template<typename dst_t>
|
||||||
|
@ -4387,7 +4326,6 @@ static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restri
|
||||||
const block_q6_K * x = (const block_q6_K *) vx;
|
const block_q6_K * x = (const block_q6_K *) vx;
|
||||||
|
|
||||||
const int i = item_ct1.get_group(2);
|
const int i = item_ct1.get_group(2);
|
||||||
#if QK_K == 256
|
|
||||||
|
|
||||||
// assume 64 threads - this is very slightly better than the one below
|
// assume 64 threads - this is very slightly better than the one below
|
||||||
const int tid = item_ct1.get_local_id(2);
|
const int tid = item_ct1.get_local_id(2);
|
||||||
|
@ -4407,24 +4345,6 @@ static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restri
|
||||||
y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
|
y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
|
||||||
y[64] = d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32);
|
y[64] = d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32);
|
||||||
y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
|
y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
|
||||||
#else
|
|
||||||
|
|
||||||
// assume 32 threads
|
|
||||||
const int tid = item_ct1.get_local_id(2);
|
|
||||||
const int ip = tid/16; // 0 or 1
|
|
||||||
const int il = tid - 16*ip; // 0...15
|
|
||||||
|
|
||||||
dst_t * y = yy + i*QK_K + 16*ip + il;
|
|
||||||
|
|
||||||
const float d = x[i].d;
|
|
||||||
|
|
||||||
const uint8_t ql = x[i].ql[16*ip + il];
|
|
||||||
const uint8_t qh = x[i].qh[il] >> (2*ip);
|
|
||||||
const int8_t * sc = x[i].scales;
|
|
||||||
|
|
||||||
y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
|
|
||||||
y[32] = d * sc[ip+2] * ((int8_t)((ql >> 4) | (((qh >> 4) & 3) << 4)) - 32);
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename dst_t>
|
template<typename dst_t>
|
||||||
|
@ -4438,7 +4358,6 @@ static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __res
|
||||||
const block_iq2_xxs * x = (const block_iq2_xxs *) vx;
|
const block_iq2_xxs * x = (const block_iq2_xxs *) vx;
|
||||||
|
|
||||||
const int tid = item_ct1.get_local_id(2);
|
const int tid = item_ct1.get_local_id(2);
|
||||||
#if QK_K == 256
|
|
||||||
const int il = tid/8; // 0...3
|
const int il = tid/8; // 0...3
|
||||||
const int ib = tid%8; // 0...7
|
const int ib = tid%8; // 0...7
|
||||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||||
|
@ -4449,10 +4368,6 @@ static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __res
|
||||||
const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.25f;
|
const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.25f;
|
||||||
const uint8_t signs = ksigns_iq2xs_ptr[(aux32 >> 7*il) & 127];
|
const uint8_t signs = ksigns_iq2xs_ptr[(aux32 >> 7*il) & 127];
|
||||||
for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs_ptr[j] ? -1.f : 1.f);
|
for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs_ptr[j] ? -1.f : 1.f);
|
||||||
#else
|
|
||||||
assert(false);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename dst_t>
|
template<typename dst_t>
|
||||||
|
@ -4466,7 +4381,6 @@ static void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __rest
|
||||||
const block_iq2_xs * x = (const block_iq2_xs *) vx;
|
const block_iq2_xs * x = (const block_iq2_xs *) vx;
|
||||||
|
|
||||||
const int tid = item_ct1.get_local_id(2);
|
const int tid = item_ct1.get_local_id(2);
|
||||||
#if QK_K == 256
|
|
||||||
const int il = tid/8; // 0...3
|
const int il = tid/8; // 0...3
|
||||||
const int ib = tid%8; // 0...7
|
const int ib = tid%8; // 0...7
|
||||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||||
|
@ -4475,10 +4389,6 @@ static void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __rest
|
||||||
const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
|
const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
|
||||||
const uint8_t signs = ksigns_iq2xs[q2[il] >> 9];
|
const uint8_t signs = ksigns_iq2xs[q2[il] >> 9];
|
||||||
for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
|
for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
|
||||||
#else
|
|
||||||
assert(false);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename dst_t>
|
template <typename dst_t>
|
||||||
|
@ -4490,7 +4400,6 @@ dequantize_block_iq2_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
||||||
const block_iq2_s * x = (const block_iq2_s *) vx;
|
const block_iq2_s * x = (const block_iq2_s *) vx;
|
||||||
|
|
||||||
const int tid = item_ct1.get_local_id(2);
|
const int tid = item_ct1.get_local_id(2);
|
||||||
#if QK_K == 256
|
|
||||||
const int il = tid/8; // 0...3
|
const int il = tid/8; // 0...3
|
||||||
const int ib = tid%8; // 0...7
|
const int ib = tid%8; // 0...7
|
||||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||||
|
@ -4498,13 +4407,9 @@ dequantize_block_iq2_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
||||||
const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
|
const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
|
||||||
const uint8_t signs = x[i].qs[QK_K/8+4*ib+il];
|
const uint8_t signs = x[i].qs[QK_K/8+4*ib+il];
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int j = 0; j < 8; ++j)
|
for (int j = 0; j < 8; ++j) {
|
||||||
y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
|
y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
|
||||||
#else
|
}
|
||||||
assert(false);
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename dst_t>
|
template<typename dst_t>
|
||||||
|
@ -4518,7 +4423,6 @@ static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __res
|
||||||
const block_iq3_xxs * x = (const block_iq3_xxs *) vx;
|
const block_iq3_xxs * x = (const block_iq3_xxs *) vx;
|
||||||
|
|
||||||
const int tid = item_ct1.get_local_id(2);
|
const int tid = item_ct1.get_local_id(2);
|
||||||
#if QK_K == 256
|
|
||||||
const int il = tid/8; // 0...3
|
const int il = tid/8; // 0...3
|
||||||
const int ib = tid%8; // 0...7
|
const int ib = tid%8; // 0...7
|
||||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||||
|
@ -4533,10 +4437,6 @@ static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __res
|
||||||
y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
||||||
y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
||||||
}
|
}
|
||||||
#else
|
|
||||||
assert(false);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename dst_t>
|
template <typename dst_t>
|
||||||
|
@ -4549,7 +4449,6 @@ dequantize_block_iq3_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
||||||
const block_iq3_s * x = (const block_iq3_s *) vx;
|
const block_iq3_s * x = (const block_iq3_s *) vx;
|
||||||
|
|
||||||
const int tid = item_ct1.get_local_id(2);
|
const int tid = item_ct1.get_local_id(2);
|
||||||
#if QK_K == 256
|
|
||||||
const int il = tid/8; // 0...3
|
const int il = tid/8; // 0...3
|
||||||
const int ib = tid%8; // 0...7
|
const int ib = tid%8; // 0...7
|
||||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||||
|
@ -4563,10 +4462,6 @@ dequantize_block_iq3_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
||||||
y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
||||||
y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
||||||
}
|
}
|
||||||
#else
|
|
||||||
assert(false);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename dst_t>
|
template <typename dst_t>
|
||||||
|
@ -4579,7 +4474,6 @@ dequantize_block_iq1_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
||||||
const block_iq1_s * x = (const block_iq1_s *) vx;
|
const block_iq1_s * x = (const block_iq1_s *) vx;
|
||||||
|
|
||||||
const int tid = item_ct1.get_local_id(2);
|
const int tid = item_ct1.get_local_id(2);
|
||||||
#if QK_K == 256
|
|
||||||
const int il = tid/8; // 0...3
|
const int il = tid/8; // 0...3
|
||||||
const int ib = tid%8; // 0...7
|
const int ib = tid%8; // 0...7
|
||||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||||
|
@ -4593,10 +4487,6 @@ dequantize_block_iq1_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
||||||
for (int j = 0; j < 8; ++j) {
|
for (int j = 0; j < 8; ++j) {
|
||||||
y[j] = d * (q[j] + delta);
|
y[j] = d * (q[j] + delta);
|
||||||
}
|
}
|
||||||
#else
|
|
||||||
assert(false);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename dst_t>
|
template <typename dst_t>
|
||||||
|
@ -4609,7 +4499,6 @@ dequantize_block_iq1_m(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
||||||
const block_iq1_m * x = (const block_iq1_m *) vx;
|
const block_iq1_m * x = (const block_iq1_m *) vx;
|
||||||
|
|
||||||
const int tid = item_ct1.get_local_id(2);
|
const int tid = item_ct1.get_local_id(2);
|
||||||
#if QK_K == 256
|
|
||||||
const int il = tid/8; // 0...3
|
const int il = tid/8; // 0...3
|
||||||
const int ib = tid%8; // 0...7
|
const int ib = tid%8; // 0...7
|
||||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||||
|
@ -4627,10 +4516,6 @@ dequantize_block_iq1_m(const void *__restrict__ vx, dst_t *__restrict__ yy,
|
||||||
for (int j = 0; j < 8; ++j) {
|
for (int j = 0; j < 8; ++j) {
|
||||||
y[j] = d * (q[j] + delta);
|
y[j] = d * (q[j] + delta);
|
||||||
}
|
}
|
||||||
#else
|
|
||||||
assert(false);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename dst_t>
|
template <typename dst_t>
|
||||||
|
@ -4704,7 +4589,6 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx,
|
||||||
|
|
||||||
float tmp = 0; // partial sum for thread in warp
|
float tmp = 0; // partial sum for thread in warp
|
||||||
|
|
||||||
#if QK_K == 256
|
|
||||||
const int tid =
|
const int tid =
|
||||||
item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...15
|
item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...15
|
||||||
const int ix =
|
const int ix =
|
||||||
|
@ -4755,42 +4639,6 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx,
|
||||||
tmp += dall * sum1 - dmin * sum2;
|
tmp += dall * sum1 - dmin * sum2;
|
||||||
|
|
||||||
}
|
}
|
||||||
#else
|
|
||||||
const int tid = item_ct1.get_local_id(2) /
|
|
||||||
(2 * K_QUANTS_PER_ITERATION); // 0...15 or 0...7
|
|
||||||
const int ix = item_ct1.get_local_id(2) %
|
|
||||||
(2 * K_QUANTS_PER_ITERATION); // 0....1 or 0...3
|
|
||||||
const int offset = tid * K_QUANTS_PER_ITERATION;
|
|
||||||
|
|
||||||
uint32_t uaux[2];
|
|
||||||
const uint8_t * d = (const uint8_t *)uaux;
|
|
||||||
|
|
||||||
|
|
||||||
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
|
||||||
|
|
||||||
const float * y = yy + i * QK_K + offset;
|
|
||||||
const uint8_t * q = x[i].qs + offset;
|
|
||||||
const uint32_t * s = (const uint32_t *)x[i].scales;
|
|
||||||
|
|
||||||
uaux[0] = s[0] & 0x0f0f0f0f;
|
|
||||||
uaux[1] = (s[0] >> 4) & 0x0f0f0f0f;
|
|
||||||
|
|
||||||
const sycl::float2 dall =
|
|
||||||
x[i].dm.convert<float, sycl::rounding_mode::automatic>();
|
|
||||||
|
|
||||||
float sum1 = 0, sum2 = 0;
|
|
||||||
for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
|
|
||||||
const uint8_t ql = q[l];
|
|
||||||
sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3)
|
|
||||||
+ y[l+16] * d[1] * ((ql >> 2) & 3)
|
|
||||||
+ y[l+32] * d[2] * ((ql >> 4) & 3)
|
|
||||||
+ y[l+48] * d[3] * ((ql >> 6) & 3);
|
|
||||||
sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7];
|
|
||||||
}
|
|
||||||
tmp += dall.x() * sum1 - dall.y() * sum2;
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// sum up partial sums and write back result
|
// sum up partial sums and write back result
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
|
@ -4828,8 +4676,6 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx,
|
||||||
|
|
||||||
float tmp = 0; // partial sum for thread in warp
|
float tmp = 0; // partial sum for thread in warp
|
||||||
|
|
||||||
#if QK_K == 256
|
|
||||||
|
|
||||||
const uint16_t kmask1 = 0x0303;
|
const uint16_t kmask1 = 0x0303;
|
||||||
const uint16_t kmask2 = 0x0f0f;
|
const uint16_t kmask2 = 0x0f0f;
|
||||||
|
|
||||||
|
@ -4882,34 +4728,6 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx,
|
||||||
tmp += d * sum;
|
tmp += d * sum;
|
||||||
|
|
||||||
}
|
}
|
||||||
#else
|
|
||||||
|
|
||||||
const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15 or 0...7
|
|
||||||
const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION); // 0....1 or 0...3
|
|
||||||
const int offset = tid * K_QUANTS_PER_ITERATION; // 0...15 or 0...14
|
|
||||||
const int in = offset/8; // 0 or 1
|
|
||||||
const int im = offset%8; // 0...7
|
|
||||||
|
|
||||||
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
|
||||||
|
|
||||||
const float * y = yy + i * QK_K + offset;
|
|
||||||
const uint8_t * q = x[i].qs + offset;
|
|
||||||
const uint8_t * s = x[i].scales;
|
|
||||||
|
|
||||||
const float dall = (float)x[i].d;
|
|
||||||
|
|
||||||
float sum = 0;
|
|
||||||
for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
|
|
||||||
const uint8_t hl = x[i].hmask[im+l] >> in;
|
|
||||||
const uint8_t ql = q[l];
|
|
||||||
sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4))
|
|
||||||
+ y[l+16] * dall * ((s[0] >> 4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4))
|
|
||||||
+ y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4))
|
|
||||||
+ y[l+48] * dall * ((s[1] >> 4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4));
|
|
||||||
}
|
|
||||||
tmp += sum;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// sum up partial sums and write back result
|
// sum up partial sums and write back result
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
|
@ -4944,7 +4762,6 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx,
|
||||||
|
|
||||||
const block_q4_K * x = (const block_q4_K *)vx + ib0;
|
const block_q4_K * x = (const block_q4_K *)vx + ib0;
|
||||||
|
|
||||||
#if QK_K == 256
|
|
||||||
const uint16_t kmask1 = 0x3f3f;
|
const uint16_t kmask1 = 0x3f3f;
|
||||||
const uint16_t kmask2 = 0x0f0f;
|
const uint16_t kmask2 = 0x0f0f;
|
||||||
const uint16_t kmask3 = 0xc0c0;
|
const uint16_t kmask3 = 0xc0c0;
|
||||||
|
@ -5033,36 +4850,6 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx,
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
}
|
}
|
||||||
#else
|
|
||||||
const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15
|
|
||||||
const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION);
|
|
||||||
|
|
||||||
const int step = tid * K_QUANTS_PER_ITERATION;
|
|
||||||
|
|
||||||
uint16_t aux16[2];
|
|
||||||
const uint8_t * s = (const uint8_t *)aux16;
|
|
||||||
|
|
||||||
float tmp = 0;
|
|
||||||
|
|
||||||
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
|
||||||
const uint8_t * q = x[i].qs + step;
|
|
||||||
const float * y = yy + i*QK_K + step;
|
|
||||||
const uint16_t * a = (const uint16_t *)x[i].scales;
|
|
||||||
aux16[0] = a[0] & 0x0f0f;
|
|
||||||
aux16[1] = (a[0] >> 4) & 0x0f0f;
|
|
||||||
const float d = (float)x[i].dm[0];
|
|
||||||
const float m = (float)x[i].dm[1];
|
|
||||||
float sum = 0.f;
|
|
||||||
for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
|
|
||||||
sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
|
|
||||||
+ y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2])
|
|
||||||
+ y[j+32] * (d * s[1] * (q[j+ 0] >> 4) - m * s[3])
|
|
||||||
+ y[j+48] * (d * s[1] * (q[j+16] >> 4) - m * s[3]);
|
|
||||||
}
|
|
||||||
tmp += sum;
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// sum up partial sums and write back result
|
// sum up partial sums and write back result
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
|
@ -5097,7 +4884,6 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx,
|
||||||
|
|
||||||
float tmp = 0; // partial sum for thread in warp
|
float tmp = 0; // partial sum for thread in warp
|
||||||
|
|
||||||
#if QK_K == 256
|
|
||||||
const uint16_t kmask1 = 0x3f3f;
|
const uint16_t kmask1 = 0x3f3f;
|
||||||
const uint16_t kmask2 = 0x0f0f;
|
const uint16_t kmask2 = 0x0f0f;
|
||||||
const uint16_t kmask3 = 0xc0c0;
|
const uint16_t kmask3 = 0xc0c0;
|
||||||
|
@ -5174,30 +4960,6 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx,
|
||||||
dmin * smin;
|
dmin * smin;
|
||||||
}
|
}
|
||||||
|
|
||||||
#else
|
|
||||||
const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15
|
|
||||||
const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION);
|
|
||||||
const int step = tid * K_QUANTS_PER_ITERATION;
|
|
||||||
const int im = step/8;
|
|
||||||
const int in = step%8;
|
|
||||||
|
|
||||||
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
|
||||||
const uint8_t * q = x[i].qs + step;
|
|
||||||
const int8_t * s = x[i].scales;
|
|
||||||
const float * y = yy + i*QK_K + step;
|
|
||||||
const float d = x[i].d;
|
|
||||||
float sum = 0.f;
|
|
||||||
for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
|
|
||||||
const uint8_t h = x[i].qh[in+j] >> im;
|
|
||||||
sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16))
|
|
||||||
+ y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16))
|
|
||||||
+ y[j+32] * d * s[2] * ((q[j+ 0] >> 4) - ((h >> 4) & 1 ? 0 : 16))
|
|
||||||
+ y[j+48] * d * s[3] * ((q[j+16] >> 4) - ((h >> 6) & 1 ? 0 : 16));
|
|
||||||
}
|
|
||||||
tmp += sum;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// sum up partial sums and write back result
|
// sum up partial sums and write back result
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
for (int mask = 16; mask > 0; mask >>= 1) {
|
||||||
|
@ -5224,8 +4986,6 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa
|
||||||
|
|
||||||
const block_q6_K * x = (const block_q6_K *)vx + ib0;
|
const block_q6_K * x = (const block_q6_K *)vx + ib0;
|
||||||
|
|
||||||
#if QK_K == 256
|
|
||||||
|
|
||||||
const int tid =
|
const int tid =
|
||||||
item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16
|
item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16
|
||||||
const int ix =
|
const int ix =
|
||||||
|
@ -5282,37 +5042,6 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#else
|
|
||||||
|
|
||||||
const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...7
|
|
||||||
const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION); // 0...3
|
|
||||||
|
|
||||||
const int step = tid * K_QUANTS_PER_ITERATION;
|
|
||||||
|
|
||||||
float tmp = 0; // partial sum for thread in warp
|
|
||||||
|
|
||||||
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
|
||||||
|
|
||||||
const float * y = yy + i * QK_K + step;
|
|
||||||
const uint8_t * ql = x[i].ql + step;
|
|
||||||
const uint8_t * qh = x[i].qh + step;
|
|
||||||
const int8_t * s = x[i].scales;
|
|
||||||
|
|
||||||
const float d = x[i+0].d;
|
|
||||||
|
|
||||||
float sum = 0;
|
|
||||||
for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
|
|
||||||
sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32)
|
|
||||||
+ y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32)
|
|
||||||
+ y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >> 4) | ((qh[j] & 0x30) >> 0)) - 32)
|
|
||||||
+ y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >> 4) | ((qh[j] & 0xc0) >> 2)) - 32);
|
|
||||||
}
|
|
||||||
tmp += sum;
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// sum up partial sums and write back result
|
// sum up partial sums and write back result
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
for (int mask = 16; mask > 0; mask >>= 1) {
|
||||||
|
@ -6857,7 +6586,6 @@ static __dpct_inline__ float
|
||||||
vec_dot_q4_K_q8_1(const void *__restrict__ vbq,
|
vec_dot_q4_K_q8_1(const void *__restrict__ vbq,
|
||||||
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
|
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
|
||||||
|
|
||||||
#ifndef GGML_QKK_64
|
|
||||||
const block_q4_K * bq4_K = (const block_q4_K *) vbq;
|
const block_q4_K * bq4_K = (const block_q4_K *) vbq;
|
||||||
|
|
||||||
int v[2];
|
int v[2];
|
||||||
|
@ -6899,52 +6627,6 @@ vec_dot_q4_K_q8_1(const void *__restrict__ vbq,
|
||||||
}
|
}
|
||||||
|
|
||||||
return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
|
return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
|
||||||
|
|
||||||
#else
|
|
||||||
|
|
||||||
#if __SYCL_ARCH__ >= VER_4VEC // lowest compute capability for integer intrinsics
|
|
||||||
const block_q4_K * bq4_K = (const block_q4_K *) vbq;
|
|
||||||
|
|
||||||
float sumf_d = 0.0f;
|
|
||||||
float sumf_m = 0.0f;
|
|
||||||
|
|
||||||
uint16_t aux16[2];
|
|
||||||
const uint8_t * s = (const uint8_t *)aux16;
|
|
||||||
|
|
||||||
const uint16_t * a = (const uint16_t *)bq4_K->scales;
|
|
||||||
aux16[0] = a[0] & 0x0f0f;
|
|
||||||
aux16[1] = (a[0] >> 4) & 0x0f0f;
|
|
||||||
|
|
||||||
const float dall = bq4_K->dm[0];
|
|
||||||
const float dmin = bq4_K->dm[1];
|
|
||||||
|
|
||||||
const float d8_1 = bq8_1[0].ds[0];
|
|
||||||
const float d8_2 = bq8_1[1].ds[1];
|
|
||||||
|
|
||||||
const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
|
|
||||||
const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
|
|
||||||
const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
|
|
||||||
const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
|
|
||||||
|
|
||||||
const int * q4 = (const int *)bq4_K->qs + (iqs/2);
|
|
||||||
const int v1 = q4[0];
|
|
||||||
const int v2 = q4[4];
|
|
||||||
|
|
||||||
const int dot1 = dpct::dp4a(ui2, v2 & 0x0f0f0f0f, dpct::dp4a(ui1, v1 & 0x0f0f0f0f, 0));
|
|
||||||
const int dot2 = dpct::dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, dpct::dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
|
|
||||||
const int dot3 = dpct::dp4a(0x01010101, ui2, dpct::dp4a(0x01010101, ui1, 0));
|
|
||||||
const int dot4 = dpct::dp4a(0x01010101, ui4, dpct::dp4a(0x01010101, ui3, 0));
|
|
||||||
|
|
||||||
sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
|
|
||||||
sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
|
|
||||||
|
|
||||||
return dall * sumf_d - dmin * sumf_m;
|
|
||||||
|
|
||||||
#else
|
|
||||||
bad_arch();
|
|
||||||
#endif // __SYCL_ARCH__ >= VER_4VEC
|
|
||||||
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <int mmq_y>
|
template <int mmq_y>
|
||||||
|
@ -7003,11 +6685,7 @@ load_tiles_q4_K(const void *__restrict__ vx, int *__restrict__ x_ql,
|
||||||
|
|
||||||
const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
|
const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
|
||||||
|
|
||||||
#if QK_K == 256
|
|
||||||
x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
|
x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
|
||||||
#else
|
|
||||||
x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]};
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
|
@ -7050,7 +6728,6 @@ static __dpct_inline__ float
|
||||||
vec_dot_q5_K_q8_1(const void *__restrict__ vbq,
|
vec_dot_q5_K_q8_1(const void *__restrict__ vbq,
|
||||||
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
|
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
|
||||||
|
|
||||||
#ifndef GGML_QKK_64
|
|
||||||
const block_q5_K * bq5_K = (const block_q5_K *) vbq;
|
const block_q5_K * bq5_K = (const block_q5_K *) vbq;
|
||||||
|
|
||||||
int vl[2];
|
int vl[2];
|
||||||
|
@ -7092,48 +6769,6 @@ vec_dot_q5_K_q8_1(const void *__restrict__ vbq,
|
||||||
}
|
}
|
||||||
|
|
||||||
return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);
|
return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);
|
||||||
|
|
||||||
#else
|
|
||||||
|
|
||||||
#if __SYCL_ARCH__ >= VER_4VEC // lowest compute capability for integer intrinsics
|
|
||||||
const block_q5_K * bq5_K = (const block_q5_K *) vbq;
|
|
||||||
|
|
||||||
const int8_t * s = bq5_K->scales;
|
|
||||||
|
|
||||||
const float d = bq5_K->d;
|
|
||||||
|
|
||||||
const float d8_1 = bq8_1[0].ds[0];
|
|
||||||
const float d8_2 = bq8_1[1].ds[1];
|
|
||||||
|
|
||||||
const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
|
|
||||||
const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
|
|
||||||
const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
|
|
||||||
const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
|
|
||||||
|
|
||||||
const int * ql = (const int *)bq5_K->qs + (iqs/2);
|
|
||||||
const int vl1 = ql[0];
|
|
||||||
const int vl2 = ql[4];
|
|
||||||
|
|
||||||
const int step = 4 * (iqs/2); // 0, 4, 8, 12
|
|
||||||
const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6
|
|
||||||
const int in = step%8; // 0, 4, 0, 4
|
|
||||||
const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
|
|
||||||
|
|
||||||
const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
|
|
||||||
const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
|
|
||||||
const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
|
|
||||||
const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
|
|
||||||
|
|
||||||
const float sumf_d = d8_1 * (dpct::dp4a(ui1, v1, 0) * s[0] + dpct::dp4a(ui2, v2, 0) * s[1])
|
|
||||||
+ d8_2 * (dpct::dp4a(ui3, v3, 0) * s[2] + dpct::dp4a(ui4, v4, 0) * s[3]);
|
|
||||||
|
|
||||||
return d * sumf_d;
|
|
||||||
|
|
||||||
#else
|
|
||||||
bad_arch();
|
|
||||||
#endif // __SYCL_ARCH__ >= VER_4VEC
|
|
||||||
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <int mmq_y>
|
template <int mmq_y>
|
||||||
|
@ -7205,9 +6840,7 @@ load_tiles_q5_K(const void *__restrict__ vx, int *__restrict__ x_ql,
|
||||||
|
|
||||||
const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
|
const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
|
||||||
|
|
||||||
#if QK_K == 256
|
|
||||||
x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
|
x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
|
@ -7387,7 +7020,6 @@ vec_dot_iq2_xxs_q8_1(const void *__restrict__ vbq,
|
||||||
const block_q8_1 *__restrict__ bq8_1, const int &iqs,
|
const block_q8_1 *__restrict__ bq8_1, const int &iqs,
|
||||||
const uint64_t *iq2xxs_grid, const uint8_t *ksigns_iq2xs,
|
const uint64_t *iq2xxs_grid, const uint8_t *ksigns_iq2xs,
|
||||||
const uint8_t *kmask_iq2xs) {
|
const uint8_t *kmask_iq2xs) {
|
||||||
#if QK_K == 256
|
|
||||||
const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq;
|
const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq;
|
||||||
|
|
||||||
#if QR2_XXS == 8
|
#if QR2_XXS == 8
|
||||||
|
@ -7428,10 +7060,6 @@ vec_dot_iq2_xxs_q8_1(const void *__restrict__ vbq,
|
||||||
}
|
}
|
||||||
return d * (sumi1 + sumi2);
|
return d * (sumi1 + sumi2);
|
||||||
#endif
|
#endif
|
||||||
#else
|
|
||||||
assert(false);
|
|
||||||
return 0.f;
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static __dpct_inline__ float
|
static __dpct_inline__ float
|
||||||
|
@ -7440,7 +7068,6 @@ vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq,
|
||||||
const uint64_t *iq2xs_grid, const uint64_t *ksigns64) {
|
const uint64_t *iq2xs_grid, const uint64_t *ksigns64) {
|
||||||
#if DPCT_COMPATIBILITY_TEMP >= \
|
#if DPCT_COMPATIBILITY_TEMP >= \
|
||||||
MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
||||||
#if QK_K == 256
|
|
||||||
const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq;
|
const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq;
|
||||||
|
|
||||||
const int ib32 = iqs;
|
const int ib32 = iqs;
|
||||||
|
@ -7478,16 +7105,11 @@ vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq,
|
||||||
assert(false);
|
assert(false);
|
||||||
return 0.f;
|
return 0.f;
|
||||||
#endif
|
#endif
|
||||||
#else
|
|
||||||
assert(false);
|
|
||||||
return 0.f;
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static __dpct_inline__ float
|
static __dpct_inline__ float
|
||||||
vec_dot_iq2_s_q8_1(const void *__restrict__ vbq,
|
vec_dot_iq2_s_q8_1(const void *__restrict__ vbq,
|
||||||
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
|
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
|
||||||
#if QK_K == 256
|
|
||||||
const block_iq2_s * bq2 = (const block_iq2_s *) vbq;
|
const block_iq2_s * bq2 = (const block_iq2_s *) vbq;
|
||||||
|
|
||||||
const int ib32 = iqs;
|
const int ib32 = iqs;
|
||||||
|
@ -7531,9 +7153,6 @@ vec_dot_iq2_s_q8_1(const void *__restrict__ vbq,
|
||||||
}
|
}
|
||||||
const float d = (float)bq2->d * bq8_1[ib32].ds[0] * 0.25f;
|
const float d = (float)bq2->d * bq8_1[ib32].ds[0] * 0.25f;
|
||||||
return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
|
return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
|
||||||
#else
|
|
||||||
assert(false);
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static __dpct_inline__ float
|
static __dpct_inline__ float
|
||||||
|
@ -7542,7 +7161,6 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
|
||||||
const uint32_t *iq3xxs_grid, const uint64_t *ksigns64) {
|
const uint32_t *iq3xxs_grid, const uint64_t *ksigns64) {
|
||||||
#if DPCT_COMPATIBILITY_TEMP >= \
|
#if DPCT_COMPATIBILITY_TEMP >= \
|
||||||
MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
||||||
#if QK_K == 256
|
|
||||||
const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq;
|
const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq;
|
||||||
|
|
||||||
const int ib32 = iqs;
|
const int ib32 = iqs;
|
||||||
|
@ -7570,17 +7188,12 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
|
||||||
assert(false);
|
assert(false);
|
||||||
return 0.f;
|
return 0.f;
|
||||||
#endif
|
#endif
|
||||||
#else
|
|
||||||
assert(false);
|
|
||||||
return 0.f;
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static __dpct_inline__ float
|
static __dpct_inline__ float
|
||||||
vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
|
vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
|
||||||
const block_q8_1 *__restrict__ bq8_1, const int &iqs,
|
const block_q8_1 *__restrict__ bq8_1, const int &iqs,
|
||||||
const uint32_t *iq3s_grid) {
|
const uint32_t *iq3s_grid) {
|
||||||
#if QK_K == 256
|
|
||||||
const block_iq3_s * bq2 = (const block_iq3_s *) vbq;
|
const block_iq3_s * bq2 = (const block_iq3_s *) vbq;
|
||||||
|
|
||||||
const int ib32 = iqs;
|
const int ib32 = iqs;
|
||||||
|
@ -7609,16 +7222,12 @@ vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
|
||||||
(1 + 2 * ((bq2->scales[ib32 / 2] >> 4 * (ib32 % 2)) & 0xf)) *
|
(1 + 2 * ((bq2->scales[ib32 / 2] >> 4 * (ib32 % 2)) & 0xf)) *
|
||||||
bq8_1[ib32].ds[0];
|
bq8_1[ib32].ds[0];
|
||||||
return d * sumi;
|
return d * sumi;
|
||||||
#else
|
|
||||||
assert(false);
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static __dpct_inline__ float
|
static __dpct_inline__ float
|
||||||
vec_dot_iq1_s_q8_1(const void *__restrict__ vbq,
|
vec_dot_iq1_s_q8_1(const void *__restrict__ vbq,
|
||||||
const block_q8_1 *__restrict__ bq8_1, const int &iqs,
|
const block_q8_1 *__restrict__ bq8_1, const int &iqs,
|
||||||
const uint32_t *iq1s_grid_gpu) {
|
const uint32_t *iq1s_grid_gpu) {
|
||||||
#if QK_K == 256
|
|
||||||
const block_iq1_s * bq1 = (const block_iq1_s *) vbq;
|
const block_iq1_s * bq1 = (const block_iq1_s *) vbq;
|
||||||
|
|
||||||
const int ib32 = iqs;
|
const int ib32 = iqs;
|
||||||
|
@ -7637,15 +7246,11 @@ vec_dot_iq1_s_q8_1(const void *__restrict__ vbq,
|
||||||
const float d = d1q * bq8_1[ib32].ds[0];
|
const float d = d1q * bq8_1[ib32].ds[0];
|
||||||
const float m = d1q * bq8_1[ib32].ds[1];
|
const float m = d1q * bq8_1[ib32].ds[1];
|
||||||
return d * sumi + m * delta;
|
return d * sumi + m * delta;
|
||||||
#else
|
|
||||||
assert(false);
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static __dpct_inline__ float
|
static __dpct_inline__ float
|
||||||
vec_dot_iq1_m_q8_1(const void *__restrict__ vbq,
|
vec_dot_iq1_m_q8_1(const void *__restrict__ vbq,
|
||||||
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
|
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
|
||||||
#if QK_K == 256
|
|
||||||
const block_iq1_m * bq1 = (const block_iq1_m *) vbq;
|
const block_iq1_m * bq1 = (const block_iq1_m *) vbq;
|
||||||
|
|
||||||
const int ib32 = iqs;
|
const int ib32 = iqs;
|
||||||
|
@ -7670,9 +7275,6 @@ vec_dot_iq1_m_q8_1(const void *__restrict__ vbq,
|
||||||
scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
|
scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
|
||||||
const float d = (float)scale.f16 * bq8_1[ib32].ds[0];
|
const float d = (float)scale.f16 * bq8_1[ib32].ds[0];
|
||||||
return d * ((sumi[0] + sumf[0]) * (2*((sc[ib32/2] >> 6*(ib32%2)) & 0x7) + 1) + (sumi[1] + sumf[1]) * (2*((sc[ib32/2] >> (6*(ib32%2)+3)) & 0x7) + 1));
|
return d * ((sumi[0] + sumf[0]) * (2*((sc[ib32/2] >> 6*(ib32%2)) & 0x7) + 1) + (sumi[1] + sumf[1]) * (2*((sc[ib32/2] >> (6*(ib32%2)+3)) & 0x7) + 1));
|
||||||
#else
|
|
||||||
assert(false);
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static __dpct_inline__ void get_int_from_table_16(const uint32_t &q4,
|
static __dpct_inline__ void get_int_from_table_16(const uint32_t &q4,
|
||||||
|
@ -7720,7 +7322,6 @@ static __dpct_inline__ float
|
||||||
vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq,
|
vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq,
|
||||||
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
|
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
|
||||||
|
|
||||||
#if QK_K == 256
|
|
||||||
const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq;
|
const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq;
|
||||||
const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
|
const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
|
||||||
|
|
||||||
|
@ -7738,9 +7339,6 @@ vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq,
|
||||||
sumi2 = dpct::dp4a(v2, q8[j + 4], sumi2);
|
sumi2 = dpct::dp4a(v2, q8[j + 4], sumi2);
|
||||||
}
|
}
|
||||||
return d * (sumi1 + sumi2);
|
return d * (sumi1 + sumi2);
|
||||||
#else
|
|
||||||
assert(false);
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x,
|
template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x,
|
||||||
|
@ -10203,7 +9801,6 @@ template <typename dst_t>
|
||||||
static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int k,
|
static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int k,
|
||||||
dpct::queue_ptr stream) {
|
dpct::queue_ptr stream) {
|
||||||
const int nb = k / QK_K;
|
const int nb = k / QK_K;
|
||||||
#if QK_K == 256
|
|
||||||
{
|
{
|
||||||
dpct::has_capability_or_fail(stream->get_device(),
|
dpct::has_capability_or_fail(stream->get_device(),
|
||||||
{sycl::aspect::fp16});
|
{sycl::aspect::fp16});
|
||||||
|
@ -10215,27 +9812,12 @@ static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int k,
|
||||||
dequantize_block_q2_K(vx, y, item_ct1);
|
dequantize_block_q2_K(vx, y, item_ct1);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
#else
|
|
||||||
{
|
|
||||||
dpct::has_capability_or_fail(stream->get_device(),
|
|
||||||
{sycl::aspect::fp16});
|
|
||||||
|
|
||||||
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
|
||||||
sycl::range<3>(1, 1, 32),
|
|
||||||
sycl::range<3>(1, 1, 32)),
|
|
||||||
[=](sycl::nd_item<3> item_ct1) {
|
|
||||||
dequantize_block_q2_K(vx, y, item_ct1);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename dst_t>
|
template <typename dst_t>
|
||||||
static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int k,
|
static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int k,
|
||||||
dpct::queue_ptr stream) {
|
dpct::queue_ptr stream) {
|
||||||
const int nb = k / QK_K;
|
const int nb = k / QK_K;
|
||||||
#if QK_K == 256
|
|
||||||
{
|
{
|
||||||
dpct::has_capability_or_fail(stream->get_device(),
|
dpct::has_capability_or_fail(stream->get_device(),
|
||||||
{sycl::aspect::fp16});
|
{sycl::aspect::fp16});
|
||||||
|
@ -10247,19 +9829,6 @@ static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int k,
|
||||||
dequantize_block_q3_K(vx, y, item_ct1);
|
dequantize_block_q3_K(vx, y, item_ct1);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
#else
|
|
||||||
{
|
|
||||||
dpct::has_capability_or_fail(stream->get_device(),
|
|
||||||
{sycl::aspect::fp16});
|
|
||||||
|
|
||||||
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
|
||||||
sycl::range<3>(1, 1, 32),
|
|
||||||
sycl::range<3>(1, 1, 32)),
|
|
||||||
[=](sycl::nd_item<3> item_ct1) {
|
|
||||||
dequantize_block_q3_K(vx, y, item_ct1);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename dst_t>
|
template <typename dst_t>
|
||||||
|
@ -10320,7 +9889,6 @@ template <typename dst_t>
|
||||||
static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int k,
|
static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int k,
|
||||||
dpct::queue_ptr stream) {
|
dpct::queue_ptr stream) {
|
||||||
const int nb = k / QK_K;
|
const int nb = k / QK_K;
|
||||||
#if QK_K == 256
|
|
||||||
{
|
{
|
||||||
dpct::has_capability_or_fail(stream->get_device(),
|
dpct::has_capability_or_fail(stream->get_device(),
|
||||||
{sycl::aspect::fp16});
|
{sycl::aspect::fp16});
|
||||||
|
@ -10332,27 +9900,12 @@ static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int k,
|
||||||
dequantize_block_q5_K(vx, y, item_ct1);
|
dequantize_block_q5_K(vx, y, item_ct1);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
#else
|
|
||||||
{
|
|
||||||
dpct::has_capability_or_fail(stream->get_device(),
|
|
||||||
{sycl::aspect::fp16});
|
|
||||||
|
|
||||||
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
|
||||||
sycl::range<3>(1, 1, 32),
|
|
||||||
sycl::range<3>(1, 1, 32)),
|
|
||||||
[=](sycl::nd_item<3> item_ct1) {
|
|
||||||
dequantize_block_q5_K(vx, y, item_ct1);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename dst_t>
|
template <typename dst_t>
|
||||||
static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int k,
|
static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int k,
|
||||||
dpct::queue_ptr stream) {
|
dpct::queue_ptr stream) {
|
||||||
const int nb = k / QK_K;
|
const int nb = k / QK_K;
|
||||||
#if QK_K == 256
|
|
||||||
{
|
{
|
||||||
dpct::has_capability_or_fail(stream->get_device(),
|
dpct::has_capability_or_fail(stream->get_device(),
|
||||||
{sycl::aspect::fp16});
|
{sycl::aspect::fp16});
|
||||||
|
@ -10364,20 +9917,6 @@ static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int k,
|
||||||
dequantize_block_q6_K(vx, y, item_ct1);
|
dequantize_block_q6_K(vx, y, item_ct1);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
#else
|
|
||||||
{
|
|
||||||
dpct::has_capability_or_fail(stream->get_device(),
|
|
||||||
{sycl::aspect::fp16});
|
|
||||||
|
|
||||||
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
|
|
||||||
sycl::range<3>(1, 1, 32),
|
|
||||||
sycl::range<3>(1, 1, 32)),
|
|
||||||
[=](sycl::nd_item<3> item_ct1) {
|
|
||||||
dequantize_block_q6_K(vx, y, item_ct1);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename dst_t>
|
template <typename dst_t>
|
||||||
|
@ -10529,9 +10068,6 @@ template <typename dst_t>
|
||||||
static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int k,
|
static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int k,
|
||||||
dpct::queue_ptr stream) {
|
dpct::queue_ptr stream) {
|
||||||
const int nb = (k + QK_K - 1) / QK_K;
|
const int nb = (k + QK_K - 1) / QK_K;
|
||||||
#if QK_K == 64
|
|
||||||
dequantize_row_iq4_nl_sycl(vx, y, k, stream);
|
|
||||||
#else
|
|
||||||
{
|
{
|
||||||
dpct::has_capability_or_fail(stream->get_device(),
|
dpct::has_capability_or_fail(stream->get_device(),
|
||||||
{sycl::aspect::fp16});
|
{sycl::aspect::fp16});
|
||||||
|
@ -10546,7 +10082,6 @@ static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int k,
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -12051,8 +11586,6 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
|
||||||
const int nrows_y, const int nrows_dst,
|
const int nrows_y, const int nrows_dst,
|
||||||
dpct::queue_ptr stream) try {
|
dpct::queue_ptr stream) try {
|
||||||
|
|
||||||
#if QK_K == 256
|
|
||||||
|
|
||||||
int id;
|
int id;
|
||||||
SYCL_CHECK(
|
SYCL_CHECK(
|
||||||
CHECK_TRY_ERROR(id = get_current_device_id()));
|
CHECK_TRY_ERROR(id = get_current_device_id()));
|
||||||
|
@ -12167,7 +11700,6 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
catch (sycl::exception const &exc) {
|
catch (sycl::exception const &exc) {
|
||||||
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
|
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
|
||||||
|
@ -14454,6 +13986,9 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
|
||||||
ggml_tensor *dst, const float *src0_dd,
|
ggml_tensor *dst, const float *src0_dd,
|
||||||
const float *src1_dd, float *dst_dd,
|
const float *src1_dd, float *dst_dd,
|
||||||
const dpct::queue_ptr &main_stream) {
|
const dpct::queue_ptr &main_stream) {
|
||||||
|
#pragma message("TODO: implement phi3 frequency factors support")
|
||||||
|
#pragma message(" https://github.com/ggerganov/llama.cpp/pull/7225")
|
||||||
|
GGML_ASSERT(dst->src[2] == nullptr && "phi3 frequency factors not implemented yet");
|
||||||
|
|
||||||
GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
|
GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
|
||||||
GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
||||||
|
|
File diff suppressed because it is too large
Load diff
255
ggml-vulkan.cpp
255
ggml-vulkan.cpp
|
@ -290,6 +290,7 @@ struct vk_op_rope_neox_push_constants {
|
||||||
float corr_dims[4];
|
float corr_dims[4];
|
||||||
float theta_scale;
|
float theta_scale;
|
||||||
float inv_ndims;
|
float inv_ndims;
|
||||||
|
uint32_t has_freq_facs;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct vk_op_soft_max_push_constants {
|
struct vk_op_soft_max_push_constants {
|
||||||
|
@ -1522,8 +1523,8 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
|
||||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_f32, "rope_f32", rope_f32_len, rope_f32_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_f32, "rope_f32", rope_f32_len, rope_f32_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
||||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_f16, "rope_f16", rope_f16_len, rope_f16_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_f16, "rope_f16", rope_f16_len, rope_f16_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
||||||
|
|
||||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 3, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 4, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
|
||||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 3, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 4, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
|
||||||
|
|
||||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_argsort_f32, "argsort_f32", argsort_f32_len, argsort_f32_data, "main", 2, sizeof(vk_op_argsort_push_constants), {1024, 1, 1}, {}, 1);
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_argsort_f32, "argsort_f32", argsort_f32_len, argsort_f32_data, "main", 2, sizeof(vk_op_argsort_push_constants), {1024, 1, 1}, {}, 1);
|
||||||
}
|
}
|
||||||
|
@ -3732,7 +3733,7 @@ static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_op op) {
|
static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op) {
|
||||||
switch (op) {
|
switch (op) {
|
||||||
case GGML_OP_ADD:
|
case GGML_OP_ADD:
|
||||||
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||||
|
@ -3853,6 +3854,8 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
||||||
default:
|
default:
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
GGML_UNUSED(src2);
|
||||||
}
|
}
|
||||||
|
|
||||||
static ggml_vk_func_t ggml_vk_op_get_func(ggml_op op) {
|
static ggml_vk_func_t ggml_vk_op_get_func(ggml_op op) {
|
||||||
|
@ -3880,12 +3883,15 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename PC>
|
template<typename PC>
|
||||||
static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_op op, const PC&& pc) {
|
static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, const PC&& pc) {
|
||||||
#ifdef GGML_VULKAN_DEBUG
|
#ifdef GGML_VULKAN_DEBUG
|
||||||
std::cerr << "ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
|
std::cerr << "ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
|
||||||
if (src1 != nullptr) {
|
if (src1 != nullptr) {
|
||||||
std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
|
std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
|
||||||
}
|
}
|
||||||
|
if (src2 != nullptr) {
|
||||||
|
std::cerr << "), (" << src2 << ", name=" << src2->name << ", type=" << src2->type << ", ne0=" << src2->ne[0] << ", ne1=" << src2->ne[1] << ", ne2=" << src2->ne[2] << ", ne3=" << src2->ne[3] << ", nb0=" << src2->nb[0] << ", nb1=" << src2->nb[1] << ", nb2=" << src2->nb[2] << ", nb3=" << src2->nb[3];
|
||||||
|
}
|
||||||
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "), " << ggml_op_name(op) << ")" << std::endl;
|
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "), " << ggml_op_name(op) << ")" << std::endl;
|
||||||
#endif
|
#endif
|
||||||
GGML_ASSERT(op == GGML_OP_GET_ROWS || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT
|
GGML_ASSERT(op == GGML_OP_GET_ROWS || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT
|
||||||
|
@ -3896,6 +3902,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
||||||
const uint64_t ne02 = src0->ne[2];
|
const uint64_t ne02 = src0->ne[2];
|
||||||
const uint64_t ne03 = src0->ne[3];
|
const uint64_t ne03 = src0->ne[3];
|
||||||
const uint64_t ne0 = ne00 * ne01;
|
const uint64_t ne0 = ne00 * ne01;
|
||||||
|
|
||||||
const bool use_src1 = src1 != nullptr;
|
const bool use_src1 = src1 != nullptr;
|
||||||
const uint64_t ne10 = use_src1 ? src1->ne[0] : 0;
|
const uint64_t ne10 = use_src1 ? src1->ne[0] : 0;
|
||||||
const uint64_t ne11 = use_src1 ? src1->ne[1] : 0;
|
const uint64_t ne11 = use_src1 ? src1->ne[1] : 0;
|
||||||
|
@ -3904,7 +3911,14 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
||||||
const uint64_t ne1 = ne10 * ne11;
|
const uint64_t ne1 = ne10 * ne11;
|
||||||
// const uint64_t nb10 = use_src1 ? src1->nb[0] : 0;
|
// const uint64_t nb10 = use_src1 ? src1->nb[0] : 0;
|
||||||
|
|
||||||
vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, dst, op);
|
const bool use_src2 = src2 != nullptr;
|
||||||
|
const uint64_t ne20 = use_src2 ? src2->ne[0] : 0;
|
||||||
|
const uint64_t ne21 = use_src2 ? src2->ne[1] : 0;
|
||||||
|
const uint64_t ne22 = use_src2 ? src2->ne[2] : 0;
|
||||||
|
const uint64_t ne23 = use_src2 ? src2->ne[3] : 0;
|
||||||
|
const uint64_t ne2 = ne20 * ne21;
|
||||||
|
|
||||||
|
vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, src2, dst, op);
|
||||||
ggml_vk_func_t op_func;
|
ggml_vk_func_t op_func;
|
||||||
|
|
||||||
if (pipeline == nullptr) {
|
if (pipeline == nullptr) {
|
||||||
|
@ -3927,15 +3941,18 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
||||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
|
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
|
||||||
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
|
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
|
||||||
ggml_tensor_extra_gpu * extra_src1 = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
|
ggml_tensor_extra_gpu * extra_src1 = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
|
||||||
|
ggml_tensor_extra_gpu * extra_src2 = use_src2 ? (ggml_tensor_extra_gpu *) src2->extra : nullptr;
|
||||||
|
|
||||||
vk_buffer d_X = nullptr;
|
vk_buffer d_X = nullptr;
|
||||||
size_t x_buf_offset = 0;
|
size_t x_buf_offset = 0;
|
||||||
vk_buffer d_Y = nullptr;
|
vk_buffer d_Y = nullptr;
|
||||||
size_t y_buf_offset = 0;
|
size_t y_buf_offset = 0;
|
||||||
vk_buffer d_Z = nullptr;
|
vk_buffer d_Z = nullptr;
|
||||||
|
size_t z_buf_offset = 0;
|
||||||
|
|
||||||
bool src0_uma = false;
|
bool src0_uma = false;
|
||||||
bool src1_uma = false;
|
bool src1_uma = false;
|
||||||
|
bool src2_uma = false;
|
||||||
|
|
||||||
if (ctx->device->uma) {
|
if (ctx->device->uma) {
|
||||||
ggml_vk_host_get(ctx, src0->data, d_X, x_buf_offset);
|
ggml_vk_host_get(ctx, src0->data, d_X, x_buf_offset);
|
||||||
|
@ -3944,10 +3961,15 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
||||||
ggml_vk_host_get(ctx, src1->data, d_Y, y_buf_offset);
|
ggml_vk_host_get(ctx, src1->data, d_Y, y_buf_offset);
|
||||||
src1_uma = d_Y != nullptr;
|
src1_uma = d_Y != nullptr;
|
||||||
}
|
}
|
||||||
|
if (use_src2) {
|
||||||
|
ggml_vk_host_get(ctx, src2->data, d_Z, z_buf_offset);
|
||||||
|
src2_uma = d_Z != nullptr;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type)/ggml_blck_size(src0->type) * ne0, ctx->device->properties.limits.minStorageBufferOffsetAlignment);
|
uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type)/ggml_blck_size(src0->type) * ne0, ctx->device->properties.limits.minStorageBufferOffsetAlignment);
|
||||||
uint64_t y_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * ne1, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : 0;
|
uint64_t y_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * ne1, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : 0;
|
||||||
|
uint64_t z_sz = use_src2 ? ggml_vk_align_size(ggml_type_size(src2->type) * ne2, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : 0;
|
||||||
uint64_t d_sz = ggml_type_size(dst->type) * ne0;
|
uint64_t d_sz = ggml_type_size(dst->type) * ne0;
|
||||||
|
|
||||||
vk_buffer d_D = extra->buffer_gpu.lock();
|
vk_buffer d_D = extra->buffer_gpu.lock();
|
||||||
|
@ -3970,10 +3992,16 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
||||||
y_buf_offset = extra_src1->offset;
|
y_buf_offset = extra_src1->offset;
|
||||||
GGML_ASSERT(d_Y != nullptr);
|
GGML_ASSERT(d_Y != nullptr);
|
||||||
}
|
}
|
||||||
|
if (use_src2 && !src2_uma) {
|
||||||
|
d_Z = extra_src2->buffer_gpu.lock();
|
||||||
|
z_buf_offset = extra_src2->offset;
|
||||||
|
GGML_ASSERT(d_Z != nullptr);
|
||||||
|
}
|
||||||
|
|
||||||
if (op_supports_incontiguous) {
|
if (op_supports_incontiguous) {
|
||||||
x_sz = ggml_nbytes(src0);
|
x_sz = ggml_nbytes(src0);
|
||||||
y_sz = use_src1 ? ggml_nbytes(src1) : 0;
|
y_sz = use_src1 ? ggml_nbytes(src1) : 0;
|
||||||
|
z_sz = use_src2 ? ggml_nbytes(src2) : 0;
|
||||||
d_sz = ggml_nbytes(dst);
|
d_sz = ggml_nbytes(dst);
|
||||||
|
|
||||||
if (x_buf_offset + x_sz >= d_X->size) {
|
if (x_buf_offset + x_sz >= d_X->size) {
|
||||||
|
@ -3982,6 +4010,9 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
||||||
if (use_src1 && y_buf_offset + y_sz >= d_Y->size) {
|
if (use_src1 && y_buf_offset + y_sz >= d_Y->size) {
|
||||||
y_sz = VK_WHOLE_SIZE;
|
y_sz = VK_WHOLE_SIZE;
|
||||||
}
|
}
|
||||||
|
if (use_src2 && z_buf_offset + z_sz >= d_Z->size) {
|
||||||
|
z_sz = VK_WHOLE_SIZE;
|
||||||
|
}
|
||||||
if (d_buf_offset + d_sz >= d_D->size) {
|
if (d_buf_offset + d_sz >= d_D->size) {
|
||||||
d_sz = VK_WHOLE_SIZE;
|
d_sz = VK_WHOLE_SIZE;
|
||||||
}
|
}
|
||||||
|
@ -4021,13 +4052,16 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
||||||
if (use_src1 && y_sz != VK_WHOLE_SIZE) {
|
if (use_src1 && y_sz != VK_WHOLE_SIZE) {
|
||||||
y_sz *= ne12 * ne13;
|
y_sz *= ne12 * ne13;
|
||||||
}
|
}
|
||||||
|
if (use_src2 && z_sz != VK_WHOLE_SIZE) {
|
||||||
|
z_sz *= ne22 * ne23;
|
||||||
|
}
|
||||||
if (d_sz != VK_WHOLE_SIZE) {
|
if (d_sz != VK_WHOLE_SIZE) {
|
||||||
d_sz *= ne02 * ne03;
|
d_sz *= ne02 * ne03;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (op == GGML_OP_SOFT_MAX) {
|
if (op == GGML_OP_SOFT_MAX) {
|
||||||
// Empty src1 is possible on soft_max, but the shader needs a buffer
|
// Empty src1 is possible in soft_max, but the shader needs a buffer
|
||||||
vk_subbuffer subbuf_y;
|
vk_subbuffer subbuf_y;
|
||||||
if (use_src1) {
|
if (use_src1) {
|
||||||
subbuf_y = { d_Y, y_buf_offset, y_sz };
|
subbuf_y = { d_Y, y_buf_offset, y_sz };
|
||||||
|
@ -4037,6 +4071,28 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
||||||
|
|
||||||
ggml_vk_sync_buffers(subctx);
|
ggml_vk_sync_buffers(subctx);
|
||||||
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, subbuf_y, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
|
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, subbuf_y, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
|
||||||
|
} else if (op == GGML_OP_ROPE) {
|
||||||
|
const int mode = ((int32_t *) dst->op_params)[2];
|
||||||
|
const bool is_neox = mode & 2;
|
||||||
|
|
||||||
|
if (is_neox) {
|
||||||
|
// Empty src2 is possible in rope, but the shader needs a buffer
|
||||||
|
vk_subbuffer subbuf_z;
|
||||||
|
if (use_src2) {
|
||||||
|
subbuf_z = { d_Z, z_buf_offset, z_sz };
|
||||||
|
} else {
|
||||||
|
subbuf_z = { d_X, 0, d_X->size };
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_vk_sync_buffers(subctx);
|
||||||
|
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, subbuf_z, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
|
||||||
|
} else {
|
||||||
|
ggml_vk_sync_buffers(subctx);
|
||||||
|
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
|
||||||
|
}
|
||||||
|
} else if (use_src2) {
|
||||||
|
ggml_vk_sync_buffers(subctx);
|
||||||
|
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_Z, z_buf_offset, z_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
|
||||||
} else if (use_src1) {
|
} else if (use_src1) {
|
||||||
ggml_vk_sync_buffers(subctx);
|
ggml_vk_sync_buffers(subctx);
|
||||||
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
|
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
|
||||||
|
@ -4047,6 +4103,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
||||||
} else {
|
} else {
|
||||||
GGML_ASSERT(op != GGML_OP_SOFT_MAX);
|
GGML_ASSERT(op != GGML_OP_SOFT_MAX);
|
||||||
GGML_ASSERT(op != GGML_OP_ARGSORT);
|
GGML_ASSERT(op != GGML_OP_ARGSORT);
|
||||||
|
GGML_ASSERT(!use_src2);
|
||||||
|
|
||||||
ggml_pipeline_allocate_descriptor_sets(ctx, pipeline, ne02 * ne03);
|
ggml_pipeline_allocate_descriptor_sets(ctx, pipeline, ne02 * ne03);
|
||||||
|
|
||||||
|
@ -4088,7 +4145,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||||
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, dst, GGML_OP_REPEAT, { (uint32_t)ggml_nelements(src0), (uint32_t)ggml_nelements(src1), 0.0f, 0.0f });
|
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_REPEAT, { (uint32_t)ggml_nelements(src0), (uint32_t)ggml_nelements(src1), 0.0f, 0.0f });
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||||
|
@ -4096,7 +4153,7 @@ static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context * subctx,
|
||||||
const uint32_t src1_type_size = ggml_type_size(src1->type);
|
const uint32_t src1_type_size = ggml_type_size(src1->type);
|
||||||
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
||||||
|
|
||||||
ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, dst, GGML_OP_GET_ROWS, {
|
ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_GET_ROWS, {
|
||||||
(uint32_t)ggml_nelements(src0),
|
(uint32_t)ggml_nelements(src0),
|
||||||
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
||||||
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
|
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
|
||||||
|
@ -4111,7 +4168,7 @@ static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context * subctx, cons
|
||||||
const uint32_t src1_type_size = ggml_type_size(src1->type);
|
const uint32_t src1_type_size = ggml_type_size(src1->type);
|
||||||
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
||||||
|
|
||||||
ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, dst, GGML_OP_ADD, {
|
ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ADD, {
|
||||||
(uint32_t)ggml_nelements(src0),
|
(uint32_t)ggml_nelements(src0),
|
||||||
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
||||||
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
|
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
|
||||||
|
@ -4126,7 +4183,7 @@ static void ggml_vk_mul(ggml_backend_vk_context * ctx, vk_context * subctx, cons
|
||||||
const uint32_t src1_type_size = ggml_type_size(src1->type);
|
const uint32_t src1_type_size = ggml_type_size(src1->type);
|
||||||
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
||||||
|
|
||||||
ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, dst, GGML_OP_MUL, {
|
ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_MUL, {
|
||||||
(uint32_t)ggml_nelements(src0),
|
(uint32_t)ggml_nelements(src0),
|
||||||
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
||||||
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
|
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
|
||||||
|
@ -4141,7 +4198,7 @@ static void ggml_vk_scale(ggml_backend_vk_context * ctx, vk_context * subctx, co
|
||||||
const uint32_t src0_type_size = ggml_type_size(src0->type);
|
const uint32_t src0_type_size = ggml_type_size(src0->type);
|
||||||
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
||||||
|
|
||||||
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_SCALE, {
|
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SCALE, {
|
||||||
(uint32_t)ggml_nelements(src0),
|
(uint32_t)ggml_nelements(src0),
|
||||||
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
||||||
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
|
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
|
||||||
|
@ -4154,7 +4211,7 @@ static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context * subctx, cons
|
||||||
const uint32_t src0_type_size = ggml_type_size(src0->type);
|
const uint32_t src0_type_size = ggml_type_size(src0->type);
|
||||||
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
||||||
|
|
||||||
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_SQR, {
|
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SQR, {
|
||||||
(uint32_t)ggml_nelements(src0),
|
(uint32_t)ggml_nelements(src0),
|
||||||
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
||||||
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
|
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
|
||||||
|
@ -4168,7 +4225,7 @@ static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context * subctx, co
|
||||||
const uint32_t src0_type_size = ggml_type_size(src0->type);
|
const uint32_t src0_type_size = ggml_type_size(src0->type);
|
||||||
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
||||||
|
|
||||||
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_CLAMP, {
|
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CLAMP, {
|
||||||
(uint32_t)ggml_nelements(src0),
|
(uint32_t)ggml_nelements(src0),
|
||||||
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
||||||
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
|
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
|
||||||
|
@ -4183,7 +4240,7 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context * subctx, cons
|
||||||
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
||||||
const uint32_t d_offset = (extra->offset % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
|
const uint32_t d_offset = (extra->offset % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
|
||||||
|
|
||||||
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_CPY, {
|
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, {
|
||||||
(uint32_t)ggml_nelements(src0),
|
(uint32_t)ggml_nelements(src0),
|
||||||
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
||||||
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
|
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
|
||||||
|
@ -4195,21 +4252,21 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context * subctx, cons
|
||||||
static void ggml_vk_norm(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
static void ggml_vk_norm(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
||||||
float * op_params = (float *)dst->op_params;
|
float * op_params = (float *)dst->op_params;
|
||||||
|
|
||||||
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f });
|
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f });
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
||||||
float * op_params = (float *)dst->op_params;
|
float * op_params = (float *)dst->op_params;
|
||||||
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_RMS_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f });
|
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_RMS_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f });
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_vk_unary(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
static void ggml_vk_unary(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
||||||
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_UNARY, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f });
|
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UNARY, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f });
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_vk_diag_mask_inf(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
static void ggml_vk_diag_mask_inf(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
||||||
int32_t * op_params = (int32_t *)dst->op_params;
|
int32_t * op_params = (int32_t *)dst->op_params;
|
||||||
ggml_vk_op_f32<vk_op_diag_mask_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_DIAG_MASK_INF, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0] });
|
ggml_vk_op_f32<vk_op_diag_mask_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_DIAG_MASK_INF, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0] });
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||||
|
@ -4228,7 +4285,7 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx,
|
||||||
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
|
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
|
||||||
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
|
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
|
||||||
|
|
||||||
ggml_vk_op_f32<vk_op_soft_max_push_constants>(ctx, subctx, src0, src1, dst, GGML_OP_SOFT_MAX, {
|
ggml_vk_op_f32<vk_op_soft_max_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_SOFT_MAX, {
|
||||||
ncols,
|
ncols,
|
||||||
src1 != nullptr ? nrows_y : (uint32_t)0,
|
src1 != nullptr ? nrows_y : (uint32_t)0,
|
||||||
scale, max_bias,
|
scale, max_bias,
|
||||||
|
@ -4237,7 +4294,7 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
|
||||||
const int n_dims = ((int32_t *) dst->op_params)[1];
|
const int n_dims = ((int32_t *) dst->op_params)[1];
|
||||||
const int mode = ((int32_t *) dst->op_params)[2];
|
const int mode = ((int32_t *) dst->op_params)[2];
|
||||||
// const int n_ctx = ((int32_t *) dst->op_params)[3];
|
// const int n_ctx = ((int32_t *) dst->op_params)[3];
|
||||||
|
@ -4260,12 +4317,13 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, con
|
||||||
if (is_neox) {
|
if (is_neox) {
|
||||||
const float theta_scale = powf(freq_base, -2.0f/n_dims);
|
const float theta_scale = powf(freq_base, -2.0f/n_dims);
|
||||||
const float inv_ndims = -1.0f / n_dims;
|
const float inv_ndims = -1.0f / n_dims;
|
||||||
ggml_vk_op_f32<vk_op_rope_neox_push_constants>(ctx, subctx, src0, src1, dst, GGML_OP_ROPE, {
|
ggml_vk_op_f32<vk_op_rope_neox_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
|
||||||
(uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1],
|
(uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1],
|
||||||
freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}, theta_scale, inv_ndims
|
freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}, theta_scale, inv_ndims,
|
||||||
|
src2 != nullptr,
|
||||||
});
|
});
|
||||||
} else {
|
} else {
|
||||||
ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, dst, GGML_OP_ROPE, {
|
ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
|
||||||
(uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1],
|
(uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1],
|
||||||
freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}
|
freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}
|
||||||
});
|
});
|
||||||
|
@ -4288,7 +4346,7 @@ static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context * subctx,
|
||||||
|
|
||||||
std::cerr << ((ggml_sort_order) op_params[0]) << " " << GGML_SORT_ORDER_ASC << std::endl;
|
std::cerr << ((ggml_sort_order) op_params[0]) << " " << GGML_SORT_ORDER_ASC << std::endl;
|
||||||
|
|
||||||
ggml_vk_op_f32<vk_op_argsort_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_ARGSORT, {
|
ggml_vk_op_f32<vk_op_argsort_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_ARGSORT, {
|
||||||
ncols,
|
ncols,
|
||||||
ncols_pad,
|
ncols_pad,
|
||||||
op_params[0],
|
op_params[0],
|
||||||
|
@ -5404,6 +5462,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
||||||
|
|
||||||
const ggml_tensor * src0 = node->src[0];
|
const ggml_tensor * src0 = node->src[0];
|
||||||
const ggml_tensor * src1 = node->src[1];
|
const ggml_tensor * src1 = node->src[1];
|
||||||
|
const ggml_tensor * src2 = node->src[2];
|
||||||
|
|
||||||
switch (node->op) {
|
switch (node->op) {
|
||||||
case GGML_OP_UNARY:
|
case GGML_OP_UNARY:
|
||||||
|
@ -5520,7 +5579,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
||||||
|
|
||||||
break;
|
break;
|
||||||
case GGML_OP_ROPE:
|
case GGML_OP_ROPE:
|
||||||
ggml_vk_rope(ctx, ctx->compute_ctx, src0, src1, node);
|
ggml_vk_rope(ctx, ctx->compute_ctx, src0, src1, src2, node);
|
||||||
|
|
||||||
break;
|
break;
|
||||||
case GGML_OP_ARGSORT:
|
case GGML_OP_ARGSORT:
|
||||||
|
@ -6496,7 +6555,7 @@ static void ggml_vk_print_graph_origin(const ggml_tensor * tensor, std::vector<c
|
||||||
for (int j = 0; j < level; j++) {
|
for (int j = 0; j < level; j++) {
|
||||||
std::cerr << " ";
|
std::cerr << " ";
|
||||||
}
|
}
|
||||||
std::cerr << ggml_op_name(tensor->op) << " gpu=" << (tensor->extra != nullptr) << " backend=" << tensor->backend << std::endl;
|
std::cerr << ggml_op_name(tensor->op) << " gpu=" << (tensor->extra != nullptr) << std::endl;
|
||||||
|
|
||||||
done.push_back(tensor);
|
done.push_back(tensor);
|
||||||
|
|
||||||
|
@ -6546,7 +6605,7 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d
|
||||||
static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tensor * tensor, const char * name) {
|
static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tensor * tensor, const char * name) {
|
||||||
void * tensor_data = tensor->data;
|
void * tensor_data = tensor->data;
|
||||||
|
|
||||||
if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
|
if (ggml_backend_buffer_is_vk(tensor->buffer)) {
|
||||||
const size_t tensor_size = ggml_nbytes(tensor);
|
const size_t tensor_size = ggml_nbytes(tensor);
|
||||||
tensor_data = malloc(tensor_size);
|
tensor_data = malloc(tensor_size);
|
||||||
|
|
||||||
|
@ -6557,12 +6616,12 @@ static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tenso
|
||||||
}
|
}
|
||||||
|
|
||||||
std::cerr << "TENSOR CHECK " << name << " (" << tensor->name << "): " << ggml_op_name(tensor->op) << std::endl;
|
std::cerr << "TENSOR CHECK " << name << " (" << tensor->name << "): " << ggml_op_name(tensor->op) << std::endl;
|
||||||
std::cerr << "tensor=" << tensor << " tensor->backend: " << tensor->backend << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << std::endl;
|
std::cerr << "tensor=" << tensor << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << std::endl;
|
||||||
if (tensor->src[0] != nullptr) {
|
if (tensor->src[0] != nullptr) {
|
||||||
std::cerr << "tensor->src[0]=" << tensor->src[0] << " name=" << tensor->src[0]->name << " op=" << ggml_op_name(tensor->src[0]->op) << " type=" << ggml_type_name(tensor->src[0]->type) << " backend=" << tensor->src[0]->backend << " ne0=" << tensor->src[0]->ne[0] << " nb0=" << tensor->src[0]->nb[0] << " ne1=" << tensor->src[0]->ne[1] << " nb1=" << tensor->src[0]->nb[1] << " ne2=" << tensor->src[0]->ne[2] << " nb2=" << tensor->src[0]->nb[2] << " ne3=" << tensor->src[0]->ne[3] << " nb3=" << tensor->src[0]->nb[3] << std::endl;
|
std::cerr << "tensor->src[0]=" << tensor->src[0] << " name=" << tensor->src[0]->name << " op=" << ggml_op_name(tensor->src[0]->op) << " type=" << ggml_type_name(tensor->src[0]->type) << " ne0=" << tensor->src[0]->ne[0] << " nb0=" << tensor->src[0]->nb[0] << " ne1=" << tensor->src[0]->ne[1] << " nb1=" << tensor->src[0]->nb[1] << " ne2=" << tensor->src[0]->ne[2] << " nb2=" << tensor->src[0]->nb[2] << " ne3=" << tensor->src[0]->ne[3] << " nb3=" << tensor->src[0]->nb[3] << std::endl;
|
||||||
}
|
}
|
||||||
if (tensor->src[1] != nullptr) {
|
if (tensor->src[1] != nullptr) {
|
||||||
std::cerr << "tensor->src[1]=" << tensor->src[1] << " name=" << tensor->src[1]->name << " op=" << ggml_op_name(tensor->src[1]->op) << " type=" << ggml_type_name(tensor->src[1]->type) << " backend=" << tensor->src[1]->backend << " ne0=" << tensor->src[1]->ne[0] << " nb0=" << tensor->src[1]->nb[0] << " ne1=" << tensor->src[1]->ne[1] << " nb1=" << tensor->src[1]->nb[1] << " ne2=" << tensor->src[1]->ne[2] << " nb2=" << tensor->src[1]->nb[2] << " ne3=" << tensor->src[1]->ne[3] << " nb3=" << tensor->src[1]->nb[3] << std::endl;
|
std::cerr << "tensor->src[1]=" << tensor->src[1] << " name=" << tensor->src[1]->name << " op=" << ggml_op_name(tensor->src[1]->op) << " type=" << ggml_type_name(tensor->src[1]->type) << " ne0=" << tensor->src[1]->ne[0] << " nb0=" << tensor->src[1]->nb[0] << " ne1=" << tensor->src[1]->ne[1] << " nb1=" << tensor->src[1]->nb[1] << " ne2=" << tensor->src[1]->ne[2] << " nb2=" << tensor->src[1]->nb[2] << " ne3=" << tensor->src[1]->ne[3] << " nb3=" << tensor->src[1]->nb[3] << std::endl;
|
||||||
}
|
}
|
||||||
std::cerr << std::endl << "Result:" << std::endl;
|
std::cerr << std::endl << "Result:" << std::endl;
|
||||||
ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 0, 0);
|
ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 0, 0);
|
||||||
|
@ -6573,43 +6632,11 @@ static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tenso
|
||||||
std::vector<const ggml_tensor *> done;
|
std::vector<const ggml_tensor *> done;
|
||||||
ggml_vk_print_graph_origin(tensor, done);
|
ggml_vk_print_graph_origin(tensor, done);
|
||||||
|
|
||||||
if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
|
if (ggml_backend_buffer_is_vk(tensor->buffer)) {
|
||||||
free(tensor_data);
|
free(tensor_data);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_vk_check_tensor(const std::string& name, const ggml_tensor * tensor) {
|
|
||||||
return;
|
|
||||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_CPU);
|
|
||||||
if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
for (int i3 = 0; i3 < tensor->ne[3]; i3++) {
|
|
||||||
for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
|
|
||||||
for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
|
|
||||||
for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
|
|
||||||
float val = 0.0f;
|
|
||||||
if (tensor->type == GGML_TYPE_F32) {
|
|
||||||
val = *(float *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]);
|
|
||||||
} else if (tensor->type == GGML_TYPE_F16) {
|
|
||||||
val = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]));
|
|
||||||
}
|
|
||||||
if (std::isnan(val)) {
|
|
||||||
std::cerr << "ERROR: TENSOR CHECK " << name << ": Invalid value in " << ggml_op_name(tensor->op) << " i3=" << i3 << " i2=" << i2 << " i1=" << i1 << " i0=" << i0 << " val=" << val << std::endl;
|
|
||||||
std::cerr << "tensor=" << tensor << " tensor->type=" << ggml_type_name(tensor->type) << " tensor->backend: " << tensor->backend << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << std::endl;
|
|
||||||
std::cerr << std::endl;
|
|
||||||
ggml_vk_print_tensor_area(tensor, tensor->data, i0, i1, i2, i3);
|
|
||||||
std::cerr << std::endl;
|
|
||||||
std::vector<const ggml_tensor *> done;
|
|
||||||
ggml_vk_print_graph_origin(tensor, done);
|
|
||||||
GGML_ASSERT(false);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void * comp_result;
|
void * comp_result;
|
||||||
size_t comp_size;
|
size_t comp_size;
|
||||||
size_t comp_nb[GGML_MAX_DIMS];
|
size_t comp_nb[GGML_MAX_DIMS];
|
||||||
|
@ -6633,6 +6660,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
||||||
|
|
||||||
ggml_tensor * src0 = tensor->src[0];
|
ggml_tensor * src0 = tensor->src[0];
|
||||||
ggml_tensor * src1 = tensor->src[1];
|
ggml_tensor * src1 = tensor->src[1];
|
||||||
|
ggml_tensor * src2 = tensor->src[2];
|
||||||
|
|
||||||
struct ggml_init_params iparams = {
|
struct ggml_init_params iparams = {
|
||||||
/*.mem_size =*/ 1024*1024*1024,
|
/*.mem_size =*/ 1024*1024*1024,
|
||||||
|
@ -6662,10 +6690,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
||||||
|
|
||||||
src0_buffer = malloc(src0_size);
|
src0_buffer = malloc(src0_size);
|
||||||
src0_clone->data = src0_buffer;
|
src0_clone->data = src0_buffer;
|
||||||
if (src0->backend == GGML_BACKEND_TYPE_CPU) {
|
if (ggml_backend_buffer_is_host(src0->buffer)) {
|
||||||
memcpy(src0_clone->data, src0->data, src0_size);
|
memcpy(src0_clone->data, src0->data, src0_size);
|
||||||
memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS);
|
memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS);
|
||||||
} else if (src0->backend == GGML_BACKEND_TYPE_GPU) {
|
} else if (ggml_backend_buffer_is_vk(src0->buffer)) {
|
||||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src0->extra;
|
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src0->extra;
|
||||||
vk_buffer buffer_gpu = extra->buffer_gpu.lock();
|
vk_buffer buffer_gpu = extra->buffer_gpu.lock();
|
||||||
uint64_t offset = extra->offset;
|
uint64_t offset = extra->offset;
|
||||||
|
@ -6696,8 +6724,6 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
||||||
if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
|
if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
|
||||||
ggml_vk_print_tensor(ctx, src0, "src0");
|
ggml_vk_print_tensor(ctx, src0, "src0");
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_vk_check_tensor(std::string(ggml_op_name(tensor->op)) + "->src0", src0_clone);
|
|
||||||
}
|
}
|
||||||
if (src1 != nullptr) {
|
if (src1 != nullptr) {
|
||||||
src1_clone = ggml_dup_tensor(ggml_ctx, src1);
|
src1_clone = ggml_dup_tensor(ggml_ctx, src1);
|
||||||
|
@ -6706,10 +6732,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
||||||
|
|
||||||
src1_buffer = malloc(src1_size);
|
src1_buffer = malloc(src1_size);
|
||||||
src1_clone->data = src1_buffer;
|
src1_clone->data = src1_buffer;
|
||||||
if (src1->backend == GGML_BACKEND_TYPE_CPU) {
|
if (ggml_backend_buffer_is_host(src1->buffer)) {
|
||||||
memcpy(src1_clone->data, src1->data, src1_size);
|
memcpy(src1_clone->data, src1->data, src1_size);
|
||||||
memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS);
|
memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS);
|
||||||
} else if (src1->backend == GGML_BACKEND_TYPE_GPU) {
|
} else if (ggml_backend_buffer_is_vk(src1->buffer)) {
|
||||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src1->extra;
|
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src1->extra;
|
||||||
vk_buffer buffer_gpu = extra->buffer_gpu.lock();
|
vk_buffer buffer_gpu = extra->buffer_gpu.lock();
|
||||||
uint64_t offset = extra->offset;
|
uint64_t offset = extra->offset;
|
||||||
|
@ -6740,12 +6766,12 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
||||||
if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
|
if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
|
||||||
ggml_vk_print_tensor(ctx, src1, "src1");
|
ggml_vk_print_tensor(ctx, src1, "src1");
|
||||||
std::cerr << "TENSOR CHECK: " << ggml_op_name(src1_clone->op) << " (check " << check_counter << ")" << std::endl;
|
std::cerr << "TENSOR CHECK: " << ggml_op_name(src1_clone->op) << " (check " << check_counter << ")" << std::endl;
|
||||||
std::cerr << "src1_clone=" << tensor << " src1_clone->backend: " << src1_clone->backend << " src1_clone->type: " << ggml_type_name(src1_clone->type) << " ne0=" << src1_clone->ne[0] << " nb0=" << src1_clone->nb[0] << " ne1=" << src1_clone->ne[1] << " nb1=" << src1_clone->nb[1] << " ne2=" << src1_clone->ne[2] << " nb2=" << src1_clone->nb[2] << " ne3=" << src1_clone->ne[3] << " nb3=" << src1_clone->nb[3] << std::endl;
|
std::cerr << "src1_clone=" << tensor << " src1_clone->type: " << ggml_type_name(src1_clone->type) << " ne0=" << src1_clone->ne[0] << " nb0=" << src1_clone->nb[0] << " ne1=" << src1_clone->ne[1] << " nb1=" << src1_clone->nb[1] << " ne2=" << src1_clone->ne[2] << " nb2=" << src1_clone->nb[2] << " ne3=" << src1_clone->ne[3] << " nb3=" << src1_clone->nb[3] << std::endl;
|
||||||
if (src1->src[0] != nullptr) {
|
if (src1->src[0] != nullptr) {
|
||||||
std::cerr << "src1->src[0]=" << src1->src[0] << " op=" << ggml_op_name(src1->src[0]->op) << " type=" << ggml_type_name(src1->src[0]->type) << " backend=" << src1->src[0]->backend << " ne0=" << src1->src[0]->ne[0] << " nb0=" << src1->src[0]->nb[0] << " ne1=" << src1->src[0]->ne[1] << " nb1=" << src1->src[0]->nb[1] << " ne2=" << src1->src[0]->ne[2] << " nb2=" << src1->src[0]->nb[2] << " ne3=" << src1->src[0]->ne[3] << " nb3=" << src1->src[0]->nb[3] << std::endl;
|
std::cerr << "src1->src[0]=" << src1->src[0] << " op=" << ggml_op_name(src1->src[0]->op) << " type=" << ggml_type_name(src1->src[0]->type) << " ne0=" << src1->src[0]->ne[0] << " nb0=" << src1->src[0]->nb[0] << " ne1=" << src1->src[0]->ne[1] << " nb1=" << src1->src[0]->nb[1] << " ne2=" << src1->src[0]->ne[2] << " nb2=" << src1->src[0]->nb[2] << " ne3=" << src1->src[0]->ne[3] << " nb3=" << src1->src[0]->nb[3] << std::endl;
|
||||||
}
|
}
|
||||||
if (src1->src[1] != nullptr) {
|
if (src1->src[1] != nullptr) {
|
||||||
std::cerr << "src1->src[1]=" << src1->src[1] << " op=" << ggml_op_name(src1->src[1]->op) << " type=" << ggml_type_name(src1->src[1]->type) << " backend=" << src1->src[1]->backend << " ne0=" << src1->src[1]->ne[0] << " nb0=" << src1->src[1]->nb[0] << " ne1=" << src1->src[1]->ne[1] << " nb1=" << src1->src[1]->nb[1] << " ne2=" << src1->src[1]->ne[2] << " nb2=" << src1->src[1]->nb[2] << " ne3=" << src1->src[1]->ne[3] << " nb3=" << src1->src[1]->nb[3] << std::endl;
|
std::cerr << "src1->src[1]=" << src1->src[1] << " op=" << ggml_op_name(src1->src[1]->op) << " type=" << ggml_type_name(src1->src[1]->type) << " ne0=" << src1->src[1]->ne[0] << " nb0=" << src1->src[1]->nb[0] << " ne1=" << src1->src[1]->ne[1] << " nb1=" << src1->src[1]->nb[1] << " ne2=" << src1->src[1]->ne[2] << " nb2=" << src1->src[1]->nb[2] << " ne3=" << src1->src[1]->ne[3] << " nb3=" << src1->src[1]->nb[3] << std::endl;
|
||||||
}
|
}
|
||||||
std::cerr << std::endl << "Result:" << std::endl;
|
std::cerr << std::endl << "Result:" << std::endl;
|
||||||
ggml_vk_print_tensor_area(src1_clone, src1_clone->data, 5, 5, 0, 0);
|
ggml_vk_print_tensor_area(src1_clone, src1_clone->data, 5, 5, 0, 0);
|
||||||
|
@ -6756,8 +6782,64 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
||||||
std::vector<const ggml_tensor *> done;
|
std::vector<const ggml_tensor *> done;
|
||||||
ggml_vk_print_graph_origin(src1_clone, done);
|
ggml_vk_print_graph_origin(src1_clone, done);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
if (src2 != nullptr) {
|
||||||
|
src2_clone = ggml_dup_tensor(ggml_ctx, src2);
|
||||||
|
|
||||||
ggml_vk_check_tensor(std::string(ggml_op_name(tensor->op)) + "->src1", src1_clone);
|
src2_size = ggml_nbytes(src2);
|
||||||
|
|
||||||
|
src2_buffer = malloc(src2_size);
|
||||||
|
src2_clone->data = src2_buffer;
|
||||||
|
if (ggml_backend_buffer_is_host(src2->buffer)) {
|
||||||
|
memcpy(src2_clone->data, src2->data, src2_size);
|
||||||
|
memcpy(src2_clone->nb, src2->nb, sizeof(size_t) * GGML_MAX_DIMS);
|
||||||
|
} else if (ggml_backend_buffer_is_vk(src2->buffer)) {
|
||||||
|
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src2->extra;
|
||||||
|
vk_buffer buffer_gpu = extra->buffer_gpu.lock();
|
||||||
|
uint64_t offset = extra->offset;
|
||||||
|
if (!ggml_is_contiguous(src2) && ggml_vk_dim01_contiguous(src2)) {
|
||||||
|
for (int i3 = 0; i3 < src2->ne[3]; i3++) {
|
||||||
|
for (int i2 = 0; i2 < src2->ne[2]; i2++) {
|
||||||
|
const int idx = i3*src2->ne[2] + i2;
|
||||||
|
ggml_vk_buffer_read(ctx, buffer_gpu, offset + idx * src2->nb[2], ((char *)src2_clone->data + idx * src2_clone->nb[2]), src2->ne[1] * src2->nb[1]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
src2_clone->nb[0] = src2->nb[0];
|
||||||
|
src2_clone->nb[1] = src2->nb[1];
|
||||||
|
for (int i = 2; i < GGML_MAX_DIMS; i++) {
|
||||||
|
src2_clone->nb[i] = src2_clone->nb[i - 1]*src2_clone->ne[i - 1];
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (offset + src2_size >= buffer_gpu->size) {
|
||||||
|
src2_size = buffer_gpu->size - offset;
|
||||||
|
}
|
||||||
|
ggml_vk_buffer_read(ctx, buffer_gpu, offset, src2_clone->data, src2_size);
|
||||||
|
memcpy(src2_clone->nb, src2->nb, sizeof(size_t) * GGML_MAX_DIMS);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
GGML_ASSERT(false);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
|
||||||
|
ggml_vk_print_tensor(ctx, src2, "src2");
|
||||||
|
std::cerr << "TENSOR CHECK: " << ggml_op_name(src2_clone->op) << " (check " << check_counter << ")" << std::endl;
|
||||||
|
std::cerr << "src2_clone=" << tensor << " src2_clone->type: " << ggml_type_name(src2_clone->type) << " ne0=" << src2_clone->ne[0] << " nb0=" << src2_clone->nb[0] << " ne1=" << src2_clone->ne[1] << " nb1=" << src2_clone->nb[1] << " ne2=" << src2_clone->ne[2] << " nb2=" << src2_clone->nb[2] << " ne3=" << src2_clone->ne[3] << " nb3=" << src2_clone->nb[3] << std::endl;
|
||||||
|
if (src2->src[0] != nullptr) {
|
||||||
|
std::cerr << "src2->src[0]=" << src2->src[0] << " op=" << ggml_op_name(src2->src[0]->op) << " type=" << ggml_type_name(src2->src[0]->type) << " ne0=" << src2->src[0]->ne[0] << " nb0=" << src2->src[0]->nb[0] << " ne1=" << src2->src[0]->ne[1] << " nb1=" << src2->src[0]->nb[1] << " ne2=" << src2->src[0]->ne[2] << " nb2=" << src2->src[0]->nb[2] << " ne3=" << src2->src[0]->ne[3] << " nb3=" << src2->src[0]->nb[3] << std::endl;
|
||||||
|
}
|
||||||
|
if (src2->src[1] != nullptr) {
|
||||||
|
std::cerr << "src2->src[1]=" << src2->src[1] << " op=" << ggml_op_name(src2->src[1]->op) << " type=" << ggml_type_name(src2->src[1]->type) << " ne0=" << src2->src[1]->ne[0] << " nb0=" << src2->src[1]->nb[0] << " ne1=" << src2->src[1]->ne[1] << " nb1=" << src2->src[1]->nb[1] << " ne2=" << src2->src[1]->ne[2] << " nb2=" << src2->src[1]->nb[2] << " ne3=" << src2->src[1]->ne[3] << " nb3=" << src2->src[1]->nb[3] << std::endl;
|
||||||
|
}
|
||||||
|
std::cerr << std::endl << "Result:" << std::endl;
|
||||||
|
ggml_vk_print_tensor_area(src2_clone, src2_clone->data, 5, 5, 0, 0);
|
||||||
|
std::cerr << std::endl;
|
||||||
|
std::cerr << std::endl << "Result:" << std::endl;
|
||||||
|
ggml_vk_print_tensor_area(src2_clone, src2_clone->data, 5, 5, 1, 0);
|
||||||
|
std::cerr << std::endl;
|
||||||
|
std::vector<const ggml_tensor *> done;
|
||||||
|
ggml_vk_print_graph_origin(src2_clone, done);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (tensor->op == GGML_OP_MUL_MAT) {
|
if (tensor->op == GGML_OP_MUL_MAT) {
|
||||||
|
@ -6795,7 +6877,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
||||||
float attn_factor = ((float *) tensor->op_params)[8];
|
float attn_factor = ((float *) tensor->op_params)[8];
|
||||||
float beta_fast = ((float *) tensor->op_params)[9];
|
float beta_fast = ((float *) tensor->op_params)[9];
|
||||||
float beta_slow = ((float *) tensor->op_params)[10];
|
float beta_slow = ((float *) tensor->op_params)[10];
|
||||||
tensor_clone = ggml_rope_custom(ggml_ctx, src0_clone, src1_clone, n_dims, mode, n_ggml_ctx, n_orig_ggml_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
|
tensor_clone = ggml_rope_ext(ggml_ctx, src0_clone, src1_clone, src2_clone, n_dims, mode, n_ggml_ctx, n_orig_ggml_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
|
||||||
} else if (tensor->op == GGML_OP_UNARY) {
|
} else if (tensor->op == GGML_OP_UNARY) {
|
||||||
switch (ggml_get_unary_op(tensor)) {
|
switch (ggml_get_unary_op(tensor)) {
|
||||||
case GGML_UNARY_OP_SILU:
|
case GGML_UNARY_OP_SILU:
|
||||||
|
@ -6843,7 +6925,6 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
||||||
|
|
||||||
ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 8);
|
ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 8);
|
||||||
|
|
||||||
ggml_vk_check_tensor(ggml_op_name(tensor->op), tensor_clone);
|
|
||||||
if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
|
if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
|
||||||
ggml_vk_print_tensor(ctx, tensor_clone, "tensor_clone");
|
ggml_vk_print_tensor(ctx, tensor_clone, "tensor_clone");
|
||||||
}
|
}
|
||||||
|
@ -6884,7 +6965,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
|
||||||
|
|
||||||
void * tensor_data = tensor->data;
|
void * tensor_data = tensor->data;
|
||||||
|
|
||||||
if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
|
if (ggml_backend_buffer_is_vk(tensor->buffer)) {
|
||||||
size_t tensor_size = ggml_nbytes(tensor);
|
size_t tensor_size = ggml_nbytes(tensor);
|
||||||
tensor_data = malloc(tensor_size);
|
tensor_data = malloc(tensor_size);
|
||||||
|
|
||||||
|
@ -6932,12 +7013,12 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
|
||||||
|
|
||||||
if ((std::isnan(correct) != std::isnan(result)) || (std::isinf(correct) != std::isinf(result)) || !buffer_size_fit) {
|
if ((std::isnan(correct) != std::isnan(result)) || (std::isinf(correct) != std::isinf(result)) || !buffer_size_fit) {
|
||||||
std::cerr << "ERROR: Invalid value in " << ggml_op_name(tensor->op) << " i3=" << i3 << " i2=" << i2 << " i1=" << i1 << " i0=" << i0 << " result=" << result << " correct=" << correct << " avg_err=" << (avg_err / counter) << std::endl;
|
std::cerr << "ERROR: Invalid value in " << ggml_op_name(tensor->op) << " i3=" << i3 << " i2=" << i2 << " i1=" << i1 << " i0=" << i0 << " result=" << result << " correct=" << correct << " avg_err=" << (avg_err / counter) << std::endl;
|
||||||
std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->backend: " << tensor->backend << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
|
std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
|
||||||
if (src0 != nullptr) {
|
if (src0 != nullptr) {
|
||||||
std::cerr << "src0=" << src0 << " src0->name=" << src0->name << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " backend=" << src0->backend << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
|
std::cerr << "src0=" << src0 << " src0->name=" << src0->name << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
|
||||||
}
|
}
|
||||||
if (src1 != nullptr) {
|
if (src1 != nullptr) {
|
||||||
std::cerr << "src1=" << src1 << " src1->name=" << src1->name << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " backend=" << src1->backend << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
|
std::cerr << "src1=" << src1 << " src1->name=" << src1->name << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
|
||||||
}
|
}
|
||||||
std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
|
std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
|
||||||
std::cerr << std::endl << "Result:" << std::endl;
|
std::cerr << std::endl << "Result:" << std::endl;
|
||||||
|
@ -6973,12 +7054,12 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
|
||||||
|
|
||||||
if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
|
if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
|
||||||
std::cerr << "TENSOR CHECK: avg_err=" << avg_err << " in " << ggml_op_name(tensor->op) << " (check " << check_counter << ")" << std::endl;
|
std::cerr << "TENSOR CHECK: avg_err=" << avg_err << " in " << ggml_op_name(tensor->op) << " (check " << check_counter << ")" << std::endl;
|
||||||
std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->backend: " << tensor->backend << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
|
std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
|
||||||
if (src0 != nullptr) {
|
if (src0 != nullptr) {
|
||||||
std::cerr << "src0=" << src0 << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " backend=" << src0->backend << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
|
std::cerr << "src0=" << src0 << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
|
||||||
}
|
}
|
||||||
if (src1 != nullptr) {
|
if (src1 != nullptr) {
|
||||||
std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " backend=" << src1->backend << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
|
std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
|
||||||
}
|
}
|
||||||
std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
|
std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
|
||||||
std::cerr << std::endl << "Result:" << std::endl;
|
std::cerr << std::endl << "Result:" << std::endl;
|
||||||
|
@ -6997,12 +7078,12 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
|
||||||
|
|
||||||
if (avg_err > 0.05 || std::isnan(avg_err)) {
|
if (avg_err > 0.05 || std::isnan(avg_err)) {
|
||||||
std::cerr << "ERROR: avg_err=" << avg_err << " in " << ggml_op_name(tensor->op) << " (check " << check_counter << ")" << std::endl;
|
std::cerr << "ERROR: avg_err=" << avg_err << " in " << ggml_op_name(tensor->op) << " (check " << check_counter << ")" << std::endl;
|
||||||
std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->backend: " << tensor->backend << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
|
std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
|
||||||
if (src0 != nullptr) {
|
if (src0 != nullptr) {
|
||||||
std::cerr << "src0=" << src0 << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " backend=" << src0->backend << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
|
std::cerr << "src0=" << src0 << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
|
||||||
}
|
}
|
||||||
if (src1 != nullptr) {
|
if (src1 != nullptr) {
|
||||||
std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " backend=" << src1->backend << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
|
std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
|
||||||
}
|
}
|
||||||
std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
|
std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
|
||||||
std::cerr << std::endl << "Result:" << std::endl;
|
std::cerr << std::endl << "Result:" << std::endl;
|
||||||
|
@ -7014,14 +7095,14 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
|
||||||
ggml_vk_print_graph_origin(tensor, done);
|
ggml_vk_print_graph_origin(tensor, done);
|
||||||
GGML_ASSERT(false);
|
GGML_ASSERT(false);
|
||||||
} else {
|
} else {
|
||||||
std::cerr << check_counter << " " << tensor->name << " op=" << ggml_op_name(tensor->op) << " backend=" << tensor->backend << " avg_err=" << avg_err << std::endl;
|
std::cerr << check_counter << " " << tensor->name << " op=" << ggml_op_name(tensor->op) << " avg_err=" << avg_err << std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
free(comp_result);
|
free(comp_result);
|
||||||
comp_result = nullptr;
|
comp_result = nullptr;
|
||||||
comp_size = 0;
|
comp_size = 0;
|
||||||
|
|
||||||
if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
|
if (ggml_backend_buffer_is_vk(tensor->buffer)) {
|
||||||
free(tensor_data);
|
free(tensor_data);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
69
ggml.h
69
ggml.h
|
@ -481,9 +481,7 @@ extern "C" {
|
||||||
GGML_OP_ARGSORT,
|
GGML_OP_ARGSORT,
|
||||||
GGML_OP_LEAKY_RELU,
|
GGML_OP_LEAKY_RELU,
|
||||||
|
|
||||||
GGML_OP_FLASH_ATTN,
|
|
||||||
GGML_OP_FLASH_ATTN_EXT,
|
GGML_OP_FLASH_ATTN_EXT,
|
||||||
GGML_OP_FLASH_FF,
|
|
||||||
GGML_OP_FLASH_ATTN_BACK,
|
GGML_OP_FLASH_ATTN_BACK,
|
||||||
GGML_OP_SSM_CONV,
|
GGML_OP_SSM_CONV,
|
||||||
GGML_OP_SSM_SCAN,
|
GGML_OP_SSM_SCAN,
|
||||||
|
@ -1460,11 +1458,12 @@ extern "C" {
|
||||||
struct ggml_tensor * b);
|
struct ggml_tensor * b);
|
||||||
|
|
||||||
// rotary position embedding
|
// rotary position embedding
|
||||||
// if mode & 1 == 1, skip n_past elements (DEPRECATED)
|
// if mode & 1 == 1, skip n_past elements (NOT SUPPORTED)
|
||||||
// if mode & 2 == 1, GPT-NeoX style
|
// if mode & 2 == 1, GPT-NeoX style
|
||||||
// if mode & 4 == 1, ChatGLM style
|
// if mode & 4 == 1, ChatGLM style
|
||||||
//
|
//
|
||||||
// b is an int32 vector with size a->ne[2], it contains the positions
|
// b is an int32 vector with size a->ne[2], it contains the positions
|
||||||
|
// c is freq factors (e.g. phi3-128k), (optional)
|
||||||
GGML_API struct ggml_tensor * ggml_rope(
|
GGML_API struct ggml_tensor * ggml_rope(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a,
|
||||||
|
@ -1483,10 +1482,11 @@ extern "C" {
|
||||||
int n_ctx);
|
int n_ctx);
|
||||||
|
|
||||||
// custom RoPE
|
// custom RoPE
|
||||||
GGML_API struct ggml_tensor * ggml_rope_custom(
|
GGML_API struct ggml_tensor * ggml_rope_ext(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a,
|
||||||
struct ggml_tensor * b,
|
struct ggml_tensor * b,
|
||||||
|
struct ggml_tensor * c,
|
||||||
int n_dims,
|
int n_dims,
|
||||||
int mode,
|
int mode,
|
||||||
int n_ctx,
|
int n_ctx,
|
||||||
|
@ -1499,7 +1499,23 @@ extern "C" {
|
||||||
float beta_slow);
|
float beta_slow);
|
||||||
|
|
||||||
// in-place, returns view(a)
|
// in-place, returns view(a)
|
||||||
GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
|
GGML_API struct ggml_tensor * ggml_rope_ext_inplace(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a,
|
||||||
|
struct ggml_tensor * b,
|
||||||
|
struct ggml_tensor * c,
|
||||||
|
int n_dims,
|
||||||
|
int mode,
|
||||||
|
int n_ctx,
|
||||||
|
int n_orig_ctx,
|
||||||
|
float freq_base,
|
||||||
|
float freq_scale,
|
||||||
|
float ext_factor,
|
||||||
|
float attn_factor,
|
||||||
|
float beta_fast,
|
||||||
|
float beta_slow);
|
||||||
|
|
||||||
|
GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a,
|
||||||
struct ggml_tensor * b,
|
struct ggml_tensor * b,
|
||||||
|
@ -1512,20 +1528,28 @@ extern "C" {
|
||||||
float ext_factor,
|
float ext_factor,
|
||||||
float attn_factor,
|
float attn_factor,
|
||||||
float beta_fast,
|
float beta_fast,
|
||||||
float beta_slow);
|
float beta_slow),
|
||||||
|
"use ggml_rope_ext instead");
|
||||||
|
|
||||||
// compute correction dims for YaRN RoPE scaling
|
GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
|
||||||
GGML_CALL void ggml_rope_yarn_corr_dims(
|
|
||||||
int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
|
|
||||||
|
|
||||||
// xPos RoPE, in-place, returns view(a)
|
|
||||||
GGML_API struct ggml_tensor * ggml_rope_xpos_inplace(
|
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a,
|
||||||
struct ggml_tensor * b,
|
struct ggml_tensor * b,
|
||||||
int n_dims,
|
int n_dims,
|
||||||
float base,
|
int mode,
|
||||||
bool down);
|
int n_ctx,
|
||||||
|
int n_orig_ctx,
|
||||||
|
float freq_base,
|
||||||
|
float freq_scale,
|
||||||
|
float ext_factor,
|
||||||
|
float attn_factor,
|
||||||
|
float beta_fast,
|
||||||
|
float beta_slow),
|
||||||
|
"use ggml_rope_ext_inplace instead");
|
||||||
|
|
||||||
|
// compute correction dims for YaRN RoPE scaling
|
||||||
|
GGML_CALL void ggml_rope_yarn_corr_dims(
|
||||||
|
int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
|
||||||
|
|
||||||
// rotary position embedding backward, i.e compute dx from dy
|
// rotary position embedding backward, i.e compute dx from dy
|
||||||
// a - dy
|
// a - dy
|
||||||
|
@ -1533,6 +1557,7 @@ extern "C" {
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a,
|
||||||
struct ggml_tensor * b,
|
struct ggml_tensor * b,
|
||||||
|
struct ggml_tensor * c,
|
||||||
int n_dims,
|
int n_dims,
|
||||||
int mode,
|
int mode,
|
||||||
int n_ctx,
|
int n_ctx,
|
||||||
|
@ -1734,13 +1759,6 @@ extern "C" {
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a,
|
||||||
int k);
|
int k);
|
||||||
|
|
||||||
GGML_API struct ggml_tensor * ggml_flash_attn(
|
|
||||||
struct ggml_context * ctx,
|
|
||||||
struct ggml_tensor * q,
|
|
||||||
struct ggml_tensor * k,
|
|
||||||
struct ggml_tensor * v,
|
|
||||||
bool masked);
|
|
||||||
|
|
||||||
#define GGML_KQ_MASK_PAD 32
|
#define GGML_KQ_MASK_PAD 32
|
||||||
|
|
||||||
// q: [n_embd, n_batch, n_head, 1]
|
// q: [n_embd, n_batch, n_head, 1]
|
||||||
|
@ -1761,6 +1779,7 @@ extern "C" {
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a,
|
||||||
enum ggml_prec prec);
|
enum ggml_prec prec);
|
||||||
|
|
||||||
|
// TODO: needs to be adapted to ggml_flash_attn_ext
|
||||||
GGML_API struct ggml_tensor * ggml_flash_attn_back(
|
GGML_API struct ggml_tensor * ggml_flash_attn_back(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * q,
|
struct ggml_tensor * q,
|
||||||
|
@ -1769,14 +1788,6 @@ extern "C" {
|
||||||
struct ggml_tensor * d,
|
struct ggml_tensor * d,
|
||||||
bool masked);
|
bool masked);
|
||||||
|
|
||||||
GGML_API struct ggml_tensor * ggml_flash_ff(
|
|
||||||
struct ggml_context * ctx,
|
|
||||||
struct ggml_tensor * a,
|
|
||||||
struct ggml_tensor * b0,
|
|
||||||
struct ggml_tensor * b1,
|
|
||||||
struct ggml_tensor * c0,
|
|
||||||
struct ggml_tensor * c1);
|
|
||||||
|
|
||||||
GGML_API struct ggml_tensor * ggml_ssm_conv(
|
GGML_API struct ggml_tensor * ggml_ssm_conv(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * s,
|
struct ggml_tensor * s,
|
||||||
|
|
|
@ -2609,7 +2609,8 @@ layout(local_size_x = 1, local_size_y = 256, local_size_z = 1) in;
|
||||||
|
|
||||||
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
|
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
|
||||||
layout (binding = 1) readonly buffer Y {int data_b[];};
|
layout (binding = 1) readonly buffer Y {int data_b[];};
|
||||||
layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
|
layout (binding = 2) readonly buffer Z {float data_freq_factors[];};
|
||||||
|
layout (binding = 3) writeonly buffer D {D_TYPE data_d[];};
|
||||||
|
|
||||||
layout (push_constant) uniform parameter {
|
layout (push_constant) uniform parameter {
|
||||||
uint ncols;
|
uint ncols;
|
||||||
|
@ -2622,6 +2623,7 @@ layout (push_constant) uniform parameter {
|
||||||
float corr_dims[4];
|
float corr_dims[4];
|
||||||
float theta_scale;
|
float theta_scale;
|
||||||
float inv_ndims;
|
float inv_ndims;
|
||||||
|
uint has_freq_facs;
|
||||||
} p;
|
} p;
|
||||||
|
|
||||||
float rope_yarn_ramp(const float low, const float high, const uint i0) {
|
float rope_yarn_ramp(const float low, const float high, const uint i0) {
|
||||||
|
@ -2671,7 +2673,8 @@ void main() {
|
||||||
const float cur_rot = p.inv_ndims * ic - ib;
|
const float cur_rot = p.inv_ndims * ic - ib;
|
||||||
|
|
||||||
const int pos = data_b[i2];
|
const int pos = data_b[i2];
|
||||||
const float theta_base = pos*p.freq_scale*pow(p.theta_scale, col/2.0f);
|
const float freq_factor = p.has_freq_facs != 0 ? data_freq_factors[ic/2] : 1.0f;
|
||||||
|
const float theta_base = pos*p.freq_scale*pow(p.theta_scale, col/2.0f) / freq_factor;
|
||||||
|
|
||||||
float cos_theta, sin_theta;
|
float cos_theta, sin_theta;
|
||||||
rope_yarn(theta_base, uint(cur_rot), cos_theta, sin_theta);
|
rope_yarn(theta_base, uint(cur_rot), cos_theta, sin_theta);
|
||||||
|
|
|
@ -61,6 +61,7 @@ class Keys:
|
||||||
FREQ_BASE = "{arch}.rope.freq_base"
|
FREQ_BASE = "{arch}.rope.freq_base"
|
||||||
SCALING_TYPE = "{arch}.rope.scaling.type"
|
SCALING_TYPE = "{arch}.rope.scaling.type"
|
||||||
SCALING_FACTOR = "{arch}.rope.scaling.factor"
|
SCALING_FACTOR = "{arch}.rope.scaling.factor"
|
||||||
|
SCALING_ATTN_FACTOR = "{arch}.rope.scaling.attn_factor"
|
||||||
SCALING_ORIG_CTX_LEN = "{arch}.rope.scaling.original_context_length"
|
SCALING_ORIG_CTX_LEN = "{arch}.rope.scaling.original_context_length"
|
||||||
SCALING_FINETUNED = "{arch}.rope.scaling.finetuned"
|
SCALING_FINETUNED = "{arch}.rope.scaling.finetuned"
|
||||||
|
|
||||||
|
@ -148,6 +149,8 @@ class MODEL_TENSOR(IntEnum):
|
||||||
OUTPUT = auto()
|
OUTPUT = auto()
|
||||||
OUTPUT_NORM = auto()
|
OUTPUT_NORM = auto()
|
||||||
ROPE_FREQS = auto()
|
ROPE_FREQS = auto()
|
||||||
|
ROPE_FACTORS_LONG = auto()
|
||||||
|
ROPE_FACTORS_SHORT = auto()
|
||||||
ATTN_Q = auto()
|
ATTN_Q = auto()
|
||||||
ATTN_K = auto()
|
ATTN_K = auto()
|
||||||
ATTN_V = auto()
|
ATTN_V = auto()
|
||||||
|
@ -225,6 +228,8 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||||
MODEL_TENSOR.OUTPUT_NORM: "output_norm",
|
MODEL_TENSOR.OUTPUT_NORM: "output_norm",
|
||||||
MODEL_TENSOR.OUTPUT: "output",
|
MODEL_TENSOR.OUTPUT: "output",
|
||||||
MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
|
MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
|
||||||
|
MODEL_TENSOR.ROPE_FACTORS_LONG: "rope_factors_long",
|
||||||
|
MODEL_TENSOR.ROPE_FACTORS_SHORT: "rope_factors_short",
|
||||||
MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
|
MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
|
||||||
MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2",
|
MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2",
|
||||||
MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
|
MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
|
||||||
|
@ -900,9 +905,8 @@ class GGUFValueType(IntEnum):
|
||||||
raise ValueError(f"Unknown type: {type(val)}")
|
raise ValueError(f"Unknown type: {type(val)}")
|
||||||
|
|
||||||
|
|
||||||
# Note: Does not support GGML_QKK_64
|
|
||||||
QK_K = 256
|
|
||||||
# Items here are (block size, type size)
|
# Items here are (block size, type size)
|
||||||
|
QK_K = 256
|
||||||
GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {
|
GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {
|
||||||
GGMLQuantizationType.F32: (1, 4),
|
GGMLQuantizationType.F32: (1, 4),
|
||||||
GGMLQuantizationType.F16: (1, 2),
|
GGMLQuantizationType.F16: (1, 2),
|
||||||
|
|
|
@ -433,6 +433,9 @@ class GGUFWriter:
|
||||||
def add_rope_scaling_factor(self, value: float) -> None:
|
def add_rope_scaling_factor(self, value: float) -> None:
|
||||||
self.add_float32(Keys.Rope.SCALING_FACTOR.format(arch=self.arch), value)
|
self.add_float32(Keys.Rope.SCALING_FACTOR.format(arch=self.arch), value)
|
||||||
|
|
||||||
|
def add_rope_scaling_attn_factors(self, value: Sequence[float]) -> None:
|
||||||
|
self.add_float32(Keys.Rope.SCALING_ATTN_FACTOR.format(arch=self.arch), value)
|
||||||
|
|
||||||
def add_rope_scaling_orig_ctx_len(self, value: int) -> None:
|
def add_rope_scaling_orig_ctx_len(self, value: int) -> None:
|
||||||
self.add_uint32(Keys.Rope.SCALING_ORIG_CTX_LEN.format(arch=self.arch), value)
|
self.add_uint32(Keys.Rope.SCALING_ORIG_CTX_LEN.format(arch=self.arch), value)
|
||||||
|
|
||||||
|
|
6
llama.h
6
llama.h
|
@ -759,6 +759,12 @@ extern "C" {
|
||||||
// n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
|
// n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
|
||||||
LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);
|
LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);
|
||||||
|
|
||||||
|
// Get the number of threads used for generation of a single token.
|
||||||
|
LLAMA_API uint32_t llama_n_threads(struct llama_context * ctx);
|
||||||
|
|
||||||
|
// Get the number of threads used for prompt and batch processing (multiple token).
|
||||||
|
LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx);
|
||||||
|
|
||||||
// Set whether to use causal attention or not
|
// Set whether to use causal attention or not
|
||||||
// If set to true, the model will only attend to the past tokens
|
// If set to true, the model will only attend to the past tokens
|
||||||
LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
|
LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
|
||||||
|
|
|
@ -1142,20 +1142,22 @@ struct test_rope : public test_case {
|
||||||
int n_dims;
|
int n_dims;
|
||||||
int mode;
|
int mode;
|
||||||
int n_ctx;
|
int n_ctx;
|
||||||
|
bool ff;
|
||||||
|
|
||||||
std::string vars() override {
|
std::string vars() override {
|
||||||
return VARS_TO_STR5(type, ne, n_dims, mode, n_ctx);
|
return VARS_TO_STR6(type, ne, n_dims, mode, n_ctx, ff);
|
||||||
}
|
}
|
||||||
|
|
||||||
test_rope(ggml_type type = GGML_TYPE_F32,
|
test_rope(ggml_type type = GGML_TYPE_F32,
|
||||||
std::array<int64_t, 4> ne = {10, 10, 10, 1},
|
std::array<int64_t, 4> ne = {10, 10, 10, 1},
|
||||||
int n_dims = 10, int mode = 0, int n_ctx = 512)
|
int n_dims = 10, int mode = 0, int n_ctx = 512, bool ff = false)
|
||||||
: type(type), ne(ne), n_dims(n_dims), mode(mode), n_ctx(n_ctx) {}
|
: type(type), ne(ne), n_dims(n_dims), mode(mode), n_ctx(n_ctx), ff(ff) {}
|
||||||
|
|
||||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||||
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
||||||
ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne[2]);
|
ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne[2]);
|
||||||
ggml_tensor * out = ggml_rope(ctx, a, pos, n_dims, mode, n_ctx);
|
ggml_tensor * freq = ff ? ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_dims/2) : nullptr;
|
||||||
|
ggml_tensor * out = ggml_rope_ext(ctx, a, pos, freq, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f);
|
||||||
return out;
|
return out;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1168,11 +1170,16 @@ struct test_rope : public test_case {
|
||||||
data[i] = rand() % n_ctx;
|
data[i] = rand() % n_ctx;
|
||||||
}
|
}
|
||||||
ggml_backend_tensor_set(t, data.data(), 0, ne[2] * sizeof(int));
|
ggml_backend_tensor_set(t, data.data(), 0, ne[2] * sizeof(int));
|
||||||
|
} else {
|
||||||
|
if (t->ne[0] == n_dims/2) {
|
||||||
|
// frequency factors in the range [0.9f, 1.1f]
|
||||||
|
init_tensor_uniform(t, 0.9f, 1.1f);
|
||||||
} else {
|
} else {
|
||||||
init_tensor_uniform(t);
|
init_tensor_uniform(t);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
// GGML_OP_POOL2D
|
// GGML_OP_POOL2D
|
||||||
|
@ -1763,14 +1770,14 @@ struct test_llama : public test_llm {
|
||||||
struct ggml_tensor * Kcur = ggml_mul_mat(ctx, wk, cur);
|
struct ggml_tensor * Kcur = ggml_mul_mat(ctx, wk, cur);
|
||||||
struct ggml_tensor * Vcur = ggml_mul_mat(ctx, wv, cur);
|
struct ggml_tensor * Vcur = ggml_mul_mat(ctx, wv, cur);
|
||||||
|
|
||||||
Qcur = ggml_rope_custom(
|
Qcur = ggml_rope_ext(
|
||||||
ctx, ggml_reshape_3d(ctx, Qcur, hp.n_embd_head, hp.n_head, hp.n_tokens), inp_pos,
|
ctx, ggml_reshape_3d(ctx, Qcur, hp.n_embd_head, hp.n_head, hp.n_tokens), inp_pos, nullptr,
|
||||||
hp.n_rot, 0, 0, hp.n_orig_ctx, freq_base, freq_scale,
|
hp.n_rot, 0, 0, hp.n_orig_ctx, freq_base, freq_scale,
|
||||||
ext_factor, attn_factor, beta_fast, beta_slow
|
ext_factor, attn_factor, beta_fast, beta_slow
|
||||||
);
|
);
|
||||||
|
|
||||||
Kcur = ggml_rope_custom(
|
Kcur = ggml_rope_ext(
|
||||||
ctx, ggml_reshape_3d(ctx, Kcur, hp.n_embd_head, hp.n_head_kv, hp.n_tokens), inp_pos,
|
ctx, ggml_reshape_3d(ctx, Kcur, hp.n_embd_head, hp.n_head_kv, hp.n_tokens), inp_pos, nullptr,
|
||||||
hp.n_rot, 0, 0, hp.n_orig_ctx, freq_base, freq_scale,
|
hp.n_rot, 0, 0, hp.n_orig_ctx, freq_base, freq_scale,
|
||||||
ext_factor, attn_factor, beta_fast, beta_slow
|
ext_factor, attn_factor, beta_fast, beta_slow
|
||||||
);
|
);
|
||||||
|
@ -1889,13 +1896,13 @@ struct test_falcon : public test_llm {
|
||||||
Kcur = ggml_reshape_3d(ctx, Kcur, hp.n_embd_head, hp.n_head_kv, hp.n_tokens);
|
Kcur = ggml_reshape_3d(ctx, Kcur, hp.n_embd_head, hp.n_head_kv, hp.n_tokens);
|
||||||
|
|
||||||
// using mode = 2 for neox mode
|
// using mode = 2 for neox mode
|
||||||
Qcur = ggml_rope_custom(
|
Qcur = ggml_rope_ext(
|
||||||
ctx, Qcur, inp_pos, hp.n_rot, 2, 0, hp.n_orig_ctx,
|
ctx, Qcur, inp_pos, nullptr, hp.n_rot, 2, 0, hp.n_orig_ctx,
|
||||||
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
||||||
);
|
);
|
||||||
|
|
||||||
Kcur = ggml_rope_custom(
|
Kcur = ggml_rope_ext(
|
||||||
ctx, Kcur, inp_pos, hp.n_rot, 2, 0, hp.n_orig_ctx,
|
ctx, Kcur, inp_pos, nullptr, hp.n_rot, 2, 0, hp.n_orig_ctx,
|
||||||
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
||||||
);
|
);
|
||||||
|
|
||||||
|
@ -2188,16 +2195,20 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
||||||
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 8.0f));
|
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 8.0f));
|
||||||
|
|
||||||
for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
|
for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
|
||||||
test_cases.emplace_back(new test_rope(type, {128, 32, 10, 1}, 128, 0, 512)); // llama 7B
|
// TODO: ff not supported yet for !neox
|
||||||
test_cases.emplace_back(new test_rope(type, {128, 40, 10, 1}, 128, 0, 512)); // llama 13B
|
test_cases.emplace_back(new test_rope(type, {128, 32, 10, 1}, 128, 0, 512, false)); // llama 7B
|
||||||
test_cases.emplace_back(new test_rope(type, {128, 52, 10, 1}, 128, 0, 512)); // llama 30B
|
test_cases.emplace_back(new test_rope(type, {128, 40, 10, 1}, 128, 0, 512, false)); // llama 13B
|
||||||
test_cases.emplace_back(new test_rope(type, {128, 64, 10, 1}, 128, 0, 512)); // llama 65B
|
test_cases.emplace_back(new test_rope(type, {128, 52, 10, 1}, 128, 0, 512, false)); // llama 30B
|
||||||
test_cases.emplace_back(new test_rope(type, { 64, 1, 10, 1}, 64, 2, 512)); // neox (falcon 7B)
|
test_cases.emplace_back(new test_rope(type, {128, 64, 10, 1}, 128, 0, 512, false)); // llama 65B
|
||||||
test_cases.emplace_back(new test_rope(type, { 64, 71, 10, 1}, 64, 2, 512)); // neox (falcon 7B)
|
|
||||||
test_cases.emplace_back(new test_rope(type, { 64, 8, 10, 1}, 64, 2, 512)); // neox (falcon 40B)
|
for (bool ff : {false, true}) { // freq_factors
|
||||||
test_cases.emplace_back(new test_rope(type, { 64, 128, 10, 1}, 64, 2, 512)); // neox (falcon 40B)
|
test_cases.emplace_back(new test_rope(type, { 64, 1, 10, 1}, 64, 2, 512, ff)); // neox (falcon 7B)
|
||||||
test_cases.emplace_back(new test_rope(type, { 80, 32, 10, 1}, 20, 2, 512)); // neox (stablelm)
|
test_cases.emplace_back(new test_rope(type, { 64, 71, 10, 1}, 64, 2, 512, ff)); // neox (falcon 7B)
|
||||||
test_cases.emplace_back(new test_rope(type, { 80, 32, 10, 1}, 32, 2, 512)); // neox (phi-2)
|
test_cases.emplace_back(new test_rope(type, { 64, 8, 10, 1}, 64, 2, 512, ff)); // neox (falcon 40B)
|
||||||
|
test_cases.emplace_back(new test_rope(type, { 64, 128, 10, 1}, 64, 2, 512, ff)); // neox (falcon 40B)
|
||||||
|
test_cases.emplace_back(new test_rope(type, { 80, 32, 10, 1}, 20, 2, 512, ff)); // neox (stablelm)
|
||||||
|
test_cases.emplace_back(new test_rope(type, { 80, 32, 10, 1}, 32, 2, 512, ff)); // neox (phi-2)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
test_cases.emplace_back(new test_concat(GGML_TYPE_F32));
|
test_cases.emplace_back(new test_concat(GGML_TYPE_F32));
|
||||||
|
|
|
@ -49,8 +49,14 @@ int main(void) {
|
||||||
"{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif false == true %}{% set loop_messages = messages %}{% set system_message = 'You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses. You are trained by Cohere.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% if system_message != false %}{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' + system_message + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% elif message['role'] == 'assistant' %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}{% endif %}",
|
"{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif false == true %}{% set loop_messages = messages %}{% set system_message = 'You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses. You are trained by Cohere.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% if system_message != false %}{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' + system_message + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% elif message['role'] == 'assistant' %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}{% endif %}",
|
||||||
// Llama-3
|
// Llama-3
|
||||||
"{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}",
|
"{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}",
|
||||||
// Phi-3
|
//Phi-3-mini
|
||||||
"{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + ' ' + message['content'] + '<|end|> ' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|> ' }}{% else %}{{ eos_token }}{% endif %}"
|
"{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}",
|
||||||
|
//Phi-3-small
|
||||||
|
"{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}",
|
||||||
|
//Phi-3-medium
|
||||||
|
"{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}",
|
||||||
|
//Phi-3-vision
|
||||||
|
"{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{- '<|assistant|>\n' -}}{% endif %}"
|
||||||
};
|
};
|
||||||
std::vector<std::string> expected_output = {
|
std::vector<std::string> expected_output = {
|
||||||
// teknium/OpenHermes-2.5-Mistral-7B
|
// teknium/OpenHermes-2.5-Mistral-7B
|
||||||
|
@ -79,8 +85,14 @@ int main(void) {
|
||||||
"<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>You are a helpful assistant<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Hello<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>Hi there<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Who are you<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>I am an assistant<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Another question<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
|
"<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>You are a helpful assistant<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Hello<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>Hi there<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Who are you<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>I am an assistant<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Another question<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
|
||||||
// Llama 3
|
// Llama 3
|
||||||
"<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHi there<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI am an assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nAnother question<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",
|
"<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHi there<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI am an assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nAnother question<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",
|
||||||
// Phi 3
|
//Phi-3-mini
|
||||||
"<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\nI am an assistant<|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n",
|
"<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n I am an assistant <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n",
|
||||||
|
//Phi-3-small
|
||||||
|
"<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n I am an assistant <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n",
|
||||||
|
//Phi-3-medium
|
||||||
|
"<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n I am an assistant <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n",
|
||||||
|
//Phi-3-vision
|
||||||
|
"<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n I am an assistant <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n",
|
||||||
};
|
};
|
||||||
std::vector<char> formatted_chat(1024);
|
std::vector<char> formatted_chat(1024);
|
||||||
int32_t res;
|
int32_t res;
|
||||||
|
|
|
@ -1515,90 +1515,50 @@ int main(int argc, const char ** argv) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// flash_attn f32
|
// flash_attn f32
|
||||||
{
|
// TODO: adapt to ggml_flash_attn_ext() changes
|
||||||
srand(seed);
|
//{
|
||||||
const int nargs = 3;
|
// srand(seed);
|
||||||
|
// const int nargs = 3;
|
||||||
|
|
||||||
int64_t ne2[4];
|
// int64_t ne2[4];
|
||||||
|
|
||||||
get_random_dims(ne2, 4);
|
// get_random_dims(ne2, 4);
|
||||||
int64_t D = ne2[0];
|
// int64_t D = ne2[0];
|
||||||
int64_t N = ne2[1];
|
// int64_t N = ne2[1];
|
||||||
int64_t M = ne2[2] + N;
|
// int64_t M = ne2[2] + N;
|
||||||
int64_t B = ne2[3];
|
// int64_t B = ne2[3];
|
||||||
|
|
||||||
for (int masked = 0; masked <= 1; ++masked) {
|
// for (int masked = 0; masked <= 1; ++masked) {
|
||||||
for (int ndims = 2; ndims <= 4; ++ndims) {
|
// for (int ndims = 2; ndims <= 4; ++ndims) {
|
||||||
int max_nrep = (ndims >= 3) ? 2 : 1;
|
// int max_nrep = (ndims >= 3) ? 2 : 1;
|
||||||
for (int nrep = 1; nrep < max_nrep; ++nrep) {
|
// for (int nrep = 1; nrep < max_nrep; ++nrep) {
|
||||||
int64_t neq[4] = { D, N, B*nrep, ne[3] };
|
// int64_t neq[4] = { D, N, B*nrep, ne[3] };
|
||||||
int64_t nek[4] = { D, M, B, ne[3] };
|
// int64_t nek[4] = { D, M, B, ne[3] };
|
||||||
int64_t nev[4] = { M, D, B, ne[3] };
|
// int64_t nev[4] = { M, D, B, ne[3] };
|
||||||
if (ndims == 2) {
|
// if (ndims == 2) {
|
||||||
neq[2] = 1; neq[3] = 1;
|
// neq[2] = 1; neq[3] = 1;
|
||||||
nek[2] = 1; nek[3] = 1;
|
// nek[2] = 1; nek[3] = 1;
|
||||||
nev[2] = 1; nev[3] = 1;
|
// nev[2] = 1; nev[3] = 1;
|
||||||
} else if (ndims == 3) {
|
// } else if (ndims == 3) {
|
||||||
neq[3] = 1;
|
// neq[3] = 1;
|
||||||
nek[3] = 1;
|
// nek[3] = 1;
|
||||||
nev[3] = 1;
|
// nev[3] = 1;
|
||||||
}
|
// }
|
||||||
x[0] = get_random_tensor_f32(ctx0, ndims, neq, -0.1250f, 0.1250f);
|
// x[0] = get_random_tensor_f32(ctx0, ndims, neq, -0.1250f, 0.1250f);
|
||||||
x[1] = get_random_tensor_f32(ctx0, ndims, nek, -0.1250f, 0.1250f);
|
// x[1] = get_random_tensor_f32(ctx0, ndims, nek, -0.1250f, 0.1250f);
|
||||||
x[2] = get_random_tensor_f32(ctx0, ndims, nev, -0.1250f, 0.1250f);
|
// x[2] = get_random_tensor_f32(ctx0, ndims, nev, -0.1250f, 0.1250f);
|
||||||
ggml_set_param(ctx0, x[0]);
|
// ggml_set_param(ctx0, x[0]);
|
||||||
ggml_set_param(ctx0, x[1]);
|
// ggml_set_param(ctx0, x[1]);
|
||||||
ggml_set_param(ctx0, x[2]);
|
// ggml_set_param(ctx0, x[2]);
|
||||||
|
|
||||||
struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));
|
// struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));
|
||||||
|
|
||||||
check_gradient("flash_attn f32", ctx0, x, f, ndims, nargs, 1.5e-4f, 1e-3f, INFINITY);
|
// check_gradient("flash_attn f32", ctx0, x, f, ndims, nargs, 1.5e-4f, 1e-3f, INFINITY);
|
||||||
}
|
// }
|
||||||
}
|
// }
|
||||||
}
|
// }
|
||||||
}
|
//}
|
||||||
|
|
||||||
// flash_attn f16, not yet fully implemented
|
|
||||||
if(0)
|
|
||||||
{
|
|
||||||
srand(seed);
|
|
||||||
const int nargs = 3;
|
|
||||||
|
|
||||||
int64_t ne2[4];
|
|
||||||
|
|
||||||
get_random_dims(ne2, 4);
|
|
||||||
int64_t D = ne2[0];
|
|
||||||
int64_t N = ne2[1];
|
|
||||||
int64_t M = ne2[2] + N;
|
|
||||||
int64_t B = ne2[3];
|
|
||||||
|
|
||||||
for (int masked = 0; masked <= 1; ++masked) {
|
|
||||||
for (int ndims = 2; ndims <= 4; ++ndims) {
|
|
||||||
int64_t neq[4] = { D, N, B, ne[3] };
|
|
||||||
int64_t nek[4] = { D, M, B, ne[3] };
|
|
||||||
int64_t nev[4] = { M, D, B, ne[3] };
|
|
||||||
if (ndims == 2) {
|
|
||||||
neq[2] = 1; neq[3] = 1;
|
|
||||||
nek[2] = 1; nek[3] = 1;
|
|
||||||
nev[2] = 1; nev[3] = 1;
|
|
||||||
} else if (ndims == 3) {
|
|
||||||
neq[3] = 1;
|
|
||||||
nek[3] = 1;
|
|
||||||
nev[3] = 1;
|
|
||||||
}
|
|
||||||
x[0] = get_random_tensor_f16(ctx0, ndims, neq, -0.1250f, 0.1250f);
|
|
||||||
x[1] = get_random_tensor_f16(ctx0, ndims, nek, -0.1250f, 0.1250f);
|
|
||||||
x[2] = get_random_tensor_f16(ctx0, ndims, nev, -0.1250f, 0.1250f);
|
|
||||||
ggml_set_param(ctx0, x[0]);
|
|
||||||
ggml_set_param(ctx0, x[1]);
|
|
||||||
ggml_set_param(ctx0, x[2]);
|
|
||||||
|
|
||||||
struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));
|
|
||||||
|
|
||||||
check_gradient("flash_attn f16", ctx0, x, f, ndims, nargs, 1.5e-4f, 1e-3f, INFINITY);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
ggml_free(ctx0);
|
ggml_free(ctx0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -17,10 +17,15 @@ make -j tests/test-tokenizer-0
|
||||||
|
|
||||||
printf "Testing %s on %s ...\n" $name $input
|
printf "Testing %s on %s ...\n" $name $input
|
||||||
|
|
||||||
python3 ./tests/test-tokenizer-0.py ./models/tokenizers/$name --fname-tok $input > /tmp/test-tokenizer-0-$name-py.log 2>&1
|
set -e
|
||||||
cat /tmp/test-tokenizer-0-$name-py.log | grep "tokenized in"
|
|
||||||
|
|
||||||
|
printf "Tokenizing using (py) Python AutoTokenizer ...\n"
|
||||||
|
python3 ./tests/test-tokenizer-0.py ./models/tokenizers/$name --fname-tok $input > /tmp/test-tokenizer-0-$name-py.log 2>&1
|
||||||
|
|
||||||
|
printf "Tokenizing using (cpp) llama.cpp ...\n"
|
||||||
./tests/test-tokenizer-0 ./models/ggml-vocab-$name.gguf $input > /tmp/test-tokenizer-0-$name-cpp.log 2>&1
|
./tests/test-tokenizer-0 ./models/ggml-vocab-$name.gguf $input > /tmp/test-tokenizer-0-$name-cpp.log 2>&1
|
||||||
|
|
||||||
|
cat /tmp/test-tokenizer-0-$name-py.log | grep "tokenized in"
|
||||||
cat /tmp/test-tokenizer-0-$name-cpp.log | grep "tokenized in"
|
cat /tmp/test-tokenizer-0-$name-cpp.log | grep "tokenized in"
|
||||||
|
|
||||||
diff $input.tok $input.tokcpp > /dev/null 2>&1
|
diff $input.tok $input.tokcpp > /dev/null 2>&1
|
||||||
|
|
|
@ -154,19 +154,22 @@ def generator_custom_text_edge_cases() -> Iterator[str]:
|
||||||
'\uFEFF//', # unicode_ranges_control, 0xFEFF (BOM)
|
'\uFEFF//', # unicode_ranges_control, 0xFEFF (BOM)
|
||||||
'Cửa Việt', # llama-3, ignore_merges = true
|
'Cửa Việt', # llama-3, ignore_merges = true
|
||||||
'<s>a', # Phi-3 fail
|
'<s>a', # Phi-3 fail
|
||||||
'<unk><|endoftext|><s>' # Phi-3 fail
|
'<unk><|endoftext|><s>', # Phi-3 fail
|
||||||
'a\na', # TODO: Bert fail
|
'a\na', # TODO: Bert fail
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
def generator_random_special_tokens(special_tokens:list[str], iterations=100) -> Iterator[str]:
|
def generator_random_special_tokens(tokenizer, iterations=100) -> Iterator[str]:
|
||||||
special_tokens = set(special_tokens)
|
special_tokens = set(tokenizer.all_special_tokens)
|
||||||
special_tokens.update([" ", "\n", "\t", "-", "!", "one", "1", "<s>", "</s>"])
|
special_tokens.update([" ", "\n", "\t", "-", "!", "one", "1", "<s>", "</s>"])
|
||||||
special_tokens = list(sorted(special_tokens))
|
special_tokens = list(sorted(special_tokens))
|
||||||
rand = random.Random()
|
rand = random.Random()
|
||||||
for m in range(iterations):
|
for m in range(iterations):
|
||||||
rand.seed(m)
|
rand.seed(m)
|
||||||
words = rand.choices(special_tokens, k=500)
|
words = rand.choices(special_tokens, k=500)
|
||||||
|
if tokenizer.add_bos_token: # skip spam warning of double BOS
|
||||||
|
while words and words[0] == tokenizer.bos_token:
|
||||||
|
words.pop(0)
|
||||||
yield "".join(words)
|
yield "".join(words)
|
||||||
|
|
||||||
|
|
||||||
|
@ -290,18 +293,19 @@ def main(argv: list[str] = None):
|
||||||
model = LibLlamaModel(LibLlama(), args.vocab_file, mparams=dict(vocab_only=True), cparams=dict(n_ctx=4096))
|
model = LibLlamaModel(LibLlama(), args.vocab_file, mparams=dict(vocab_only=True), cparams=dict(n_ctx=4096))
|
||||||
tokenizer = AutoTokenizer.from_pretrained(args.dir_tokenizer)
|
tokenizer = AutoTokenizer.from_pretrained(args.dir_tokenizer)
|
||||||
|
|
||||||
def func_tokenize2(text: str):
|
tokenizer.add_bos_token = getattr(tokenizer, "add_bos_token", True)
|
||||||
return tokenizer.encode(text, add_special_tokens=False)
|
tokenizer.add_eos_token = getattr(tokenizer, "add_eos_token", False)
|
||||||
|
|
||||||
parse_special = all(len(func_tokenize2(t)) == 1 for t in tokenizer.all_special_tokens)
|
|
||||||
|
|
||||||
def func_tokenize1(text: str):
|
def func_tokenize1(text: str):
|
||||||
return model.tokenize(text, add_special=False, parse_special=parse_special)
|
return model.tokenize(text, add_special=True, parse_special=True)
|
||||||
|
|
||||||
|
def func_tokenize2(text: str):
|
||||||
|
return tokenizer.encode(text, add_special_tokens=True)
|
||||||
|
|
||||||
vocab = list(sorted(tokenizer.batch_decode(list(tokenizer.get_vocab().values()), skip_special_tokens=True)))
|
vocab = list(sorted(tokenizer.batch_decode(list(tokenizer.get_vocab().values()), skip_special_tokens=True)))
|
||||||
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text())
|
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text())
|
||||||
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text_edge_cases())
|
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text_edge_cases())
|
||||||
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_special_tokens(tokenizer.all_special_tokens, 10_000))
|
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_special_tokens(tokenizer, 10_000))
|
||||||
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_vocab_words(vocab))
|
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_vocab_words(vocab))
|
||||||
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_chars(10_000))
|
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_chars(10_000))
|
||||||
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_vocab_chars(vocab, 10_000))
|
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_vocab_chars(vocab, 10_000))
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue