Merge branch 'master' into HEAD

commit 8f98035e0a
30 changed files with 3486 additions and 493 deletions
CMakeLists.txt
@@ -75,6 +75,7 @@ set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for
 option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
 option(LLAMA_METAL "llama: use Metal" OFF)
 option(LLAMA_K_QUANTS "llama: use k-quants" ON)
+option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
 
 option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
@@ -225,6 +226,14 @@ if (LLAMA_BLAS)
     endif()
 endif()
 
+if (LLAMA_K_QUANTS)
+    set(GGML_SOURCES_EXTRA ${GGML_SOURCES_EXTRA} k_quants.c k_quants.h)
+    add_compile_definitions(GGML_USE_K_QUANTS)
+    if (LLAMA_QKK_64)
+        add_compile_definitions(GGML_QKK_64)
+    endif()
+endif()
+
 if (LLAMA_CUBLAS)
     cmake_minimum_required(VERSION 3.17)
 
@@ -250,6 +259,15 @@ if (LLAMA_CUBLAS)
         set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
     endif()
 
+    if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+        if (LLAMA_CUDA_DMMV_F16)
+            set(CMAKE_CUDA_ARCHITECTURES "61") # needed for f16 CUDA intrinsics
+        else()
+            set(CMAKE_CUDA_ARCHITECTURES "52") # lowest CUDA 12 standard
+        endif()
+    endif()
+    message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
+
 else()
     message(WARNING "cuBLAS not found")
 endif()
@@ -280,11 +298,6 @@ if (LLAMA_METAL)
         )
 endif()
 
-if (LLAMA_K_QUANTS)
-    set(GGML_SOURCES_EXTRA ${GGML_SOURCES_EXTRA} k_quants.c k_quants.h)
-    add_compile_definitions(GGML_USE_K_QUANTS)
-endif()
-
 if (LLAMA_CLBLAST)
     find_package(CLBlast)
     if (CLBlast_FOUND)
@@ -493,22 +506,6 @@ if (BUILD_SHARED_LIBS)
     endif()
 endif()
 
-if (GGML_SOURCES_CUDA)
-    message(STATUS "GGML CUDA sources found, configuring CUDA architecture")
-    set_property(TARGET ggml PROPERTY CUDA_ARCHITECTURES "native")
-    set_property(TARGET ggml PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
-
-    set_property(TARGET ggml_static PROPERTY CUDA_ARCHITECTURES "native")
-    set_property(TARGET ggml_static PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
-
-    if (BUILD_SHARED_LIBS)
-        set_property(TARGET ggml_shared PROPERTY CUDA_ARCHITECTURES "native")
-        set_property(TARGET ggml_shared PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
-    endif()
-
-    set_property(TARGET llama PROPERTY CUDA_ARCHITECTURES "native")
-endif()
-
 #
 # programs, examples and tests
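Note on usage (invocation sketched, not part of the diff): because `LLAMA_QKK_64` is a plain CMake option, a configuration such as `cmake .. -DLLAMA_K_QUANTS=ON -DLLAMA_QKK_64=ON` turns on the 64-value super-blocks, and the `GGML_QKK_64` compile definition then selects the alternative k-quant layouts shown later in ggml-cuda.cu.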
Makefile (9 changed lines)
@@ -43,8 +43,11 @@ endif
 
 # keep standard at C11 and C++11
 # -Ofast tends to produce faster code, but may not be available for some compilers.
-#OPT = -Ofast
+ifdef LLAMA_FAST
+OPT = -Ofast
+else
 OPT = -O3
+endif
 CFLAGS = -I. $(OPT) -std=c11 -fPIC
 CXXFLAGS = -I. -I./examples $(OPT) -std=c++11 -fPIC
 LDFLAGS =
@@ -131,6 +134,10 @@ ifndef LLAMA_NO_K_QUANTS
 CFLAGS += -DGGML_USE_K_QUANTS
 CXXFLAGS += -DGGML_USE_K_QUANTS
 OBJS += k_quants.o
+ifdef LLAMA_QKK_64
+    CFLAGS += -DGGML_QKK_64
+    CXXFLAGS += -DGGML_QKK_64
+endif
 endif
 
 ifndef LLAMA_NO_ACCELERATE
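Usage sketch for the two new Makefile switches (commands assumed, not quoted from the diff): `make LLAMA_FAST=1` opts into `-Ofast` in place of the default `-O3`, and `make LLAMA_QKK_64=1` defines `GGML_QKK_64` for both the C and C++ objects, mirroring the CMake option above.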
README.md (29 changed lines)
@@ -5,16 +5,16 @@
 [![Actions Status](https://github.com/ggerganov/llama.cpp/workflows/CI/badge.svg)](https://github.com/ggerganov/llama.cpp/actions)
 [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
 
+[Roadmap](https://github.com/users/ggerganov/projects/7) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
 
 Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
 
 **Hot topics:**
 
-- Roadmap June 2023: https://github.com/ggerganov/llama.cpp/discussions/1729
-- GPU support with Metal (Apple Silicon): https://github.com/ggerganov/llama.cpp/pull/1642
-- High-quality 2,3,4,5,6-bit quantization: https://github.com/ggerganov/llama.cpp/pull/1684
-- Multi-GPU support: https://github.com/ggerganov/llama.cpp/pull/1607
-- Training LLaMA models from scratch: https://github.com/ggerganov/llama.cpp/pull/1652
-- CPU threading improvements: https://github.com/ggerganov/llama.cpp/pull/1632
+- k-quants now support super-block size of 64: https://github.com/ggerganov/llama.cpp/pull/2001
+- New roadmap: https://github.com/users/ggerganov/projects/7
+- Azure CI brainstorming: https://github.com/ggerganov/llama.cpp/discussions/1985
+- p1 : LLM-based code completion engine at the edge : https://github.com/ggml-org/p1/discussions/1
 
 <details>
 <summary>Table of Contents</summary>
@@ -33,6 +33,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
     <li><a href="#quantization">Quantization</a></li>
     <li><a href="#interactive-mode">Interactive mode</a></li>
     <li><a href="#instruction-mode-with-alpaca">Instruction mode with Alpaca</a></li>
+    <li><a href="#using-openllama">Using OpenLLaMA</a></li>
     <li><a href="#using-gpt4all">Using GPT4All</a></li>
    <li><a href="#using-pygmalion-7b--metharme-7b">Using Pygmalion 7B & Metharme 7B</a></li>
     <li><a href="#obtaining-the-facebook-llama-original-model-and-stanford-alpaca-model-data">Obtaining the Facebook LLaMA original model and Stanford Alpaca model data</a></li>
@@ -344,7 +345,7 @@ Building the program with BLAS support may lead to some performance improvements
 | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
 | LLAMA_CUDA_DMMV_Y | Positive integer | 1 | Block size in y direction for the CUDA dequantization + mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
 | LLAMA_CUDA_DMMV_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels. Can improve performance on relatively recent GPUs. |
-| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value 2 1 can improve performance for slow GPUs. |
+| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
 
 - #### CLBlast
 
@@ -378,7 +379,7 @@ Building the program with BLAS support may lead to some performance improvements
     ```sh
     git clone https://github.com/CNugteren/CLBlast.git
     mkdir CLBlast/build
-    cd CLBLast/build
+    cd CLBlast/build
     cmake .. -DBUILD_SHARED_LIBS=OFF -DTUNERS=OFF
     cmake --build . --config Release
     cmake --install . --prefix /some/path
@@ -547,6 +548,13 @@ cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach.
 >
 ```
 
+### Using [OpenLLaMA](https://github.com/openlm-research/open_llama)
+
+OpenLLaMA is an openly licensed reproduction of Meta's original LLaMA model. It uses the same architecture and is a drop-in replacement for the original LLaMA weights.
+
+- Download the [3B](https://huggingface.co/openlm-research/open_llama_3b), [7B](https://huggingface.co/openlm-research/open_llama_7b), or [13B](https://huggingface.co/openlm-research/open_llama_13b) model from Hugging Face.
+- Convert the model to ggml FP16 format using `python convert.py <path to OpenLLaMA directory>`
+
 ### Using [GPT4All](https://github.com/nomic-ai/gpt4all)
 
 - Obtain the `tokenizer.model` file from LLaMA model and put it to `models`
@@ -676,12 +684,13 @@ Upon completion of the aforementioned steps, you will have successfully compiled
 ```
 GGML_OPENCL_PLATFORM=0
 GGML_OPENCL_DEVICE=0
-export LD_LIBRARY_PATH=/system/vendor/lib64:$LD_LIBRARY_PATH
-./main (...)
+export LD_LIBRARY_PATH=/vendor/lib64:$LD_LIBRARY_PATH
 ```
 
 For easy and swift re-execution, consider documenting this final part in a .sh script file. This will enable you to rerun the process with minimal hassle.
 
+Place your desired model into the `/llama.cpp/models/` directory and execute the `./main (...)` script.
+
 ### Docker
 
 #### Prerequisites
build.zig (79 changed lines)
@@ -1,61 +1,58 @@
 const std = @import("std");
 
+// Zig Version: 0.11.0-dev.3379+629f0d23b
 pub fn build(b: *std.build.Builder) void {
     const target = b.standardTargetOptions(.{});
-    const optimize = b.standardReleaseOptions();
-    const want_lto = b.option(bool, "lto", "Want -fLTO");
-
-    const lib = b.addStaticLibrary("llama", null);
-    lib.want_lto = want_lto;
-    lib.setTarget(target);
-    lib.setBuildMode(optimize);
+    const optimize = b.standardOptimizeOption(.{});
+    const lib = b.addStaticLibrary(.{
+        .name = "llama",
+        .target = target,
+        .optimize = optimize,
+    });
+    lib.linkLibC();
     lib.linkLibCpp();
     lib.addIncludePath(".");
-    lib.addIncludePath("examples");
+    lib.addIncludePath("./examples");
     lib.addCSourceFiles(&.{
         "ggml.c",
     }, &.{"-std=c11"});
     lib.addCSourceFiles(&.{
         "llama.cpp",
     }, &.{"-std=c++11"});
-    lib.install();
-
-    const build_args = .{ .b = b, .lib = lib, .target = target, .optimize = optimize, .want_lto = want_lto };
-
-    const exe = build_example("main", build_args);
-    _ = build_example("quantize", build_args);
-    _ = build_example("perplexity", build_args);
-    _ = build_example("embedding", build_args);
-
-    // create "zig build run" command for ./main
-
-    const run_cmd = exe.run();
-    run_cmd.step.dependOn(b.getInstallStep());
-    if (b.args) |args| {
-        run_cmd.addArgs(args);
-    }
-
-    const run_step = b.step("run", "Run the app");
-    run_step.dependOn(&run_cmd.step);
-}
-
-fn build_example(comptime name: []const u8, args: anytype) *std.build.LibExeObjStep {
-    const b = args.b;
-    const lib = args.lib;
-    const want_lto = args.want_lto;
-
-    const exe = b.addExecutable(name, null);
-    exe.want_lto = want_lto;
-    lib.setTarget(args.target);
-    lib.setBuildMode(args.optimize);
-    exe.addIncludePath(".");
-    exe.addIncludePath("examples");
-    exe.addCSourceFiles(&.{
-        std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{name, name}),
-        "examples/common.cpp",
-    }, &.{"-std=c++11"});
-    exe.linkLibrary(lib);
-    exe.install();
-
-    return exe;
-}
+    b.installArtifact(lib);
+
+    const examples = .{
+        "main",
+        "baby-llama",
+        "embedding",
+        // "metal",
+        "perplexity",
+        "quantize",
+        "quantize-stats",
+        "save-load-state",
+        // "server",
+        "simple",
+        "train-text-from-scratch",
+    };
+
+    inline for (examples) |example_name| {
+        const exe = b.addExecutable(.{
+            .name = example_name,
+            .target = target,
+            .optimize = optimize,
+        });
+        exe.addIncludePath(".");
+        exe.addIncludePath("./examples");
+        exe.addCSourceFiles(&.{
+            std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{example_name, example_name}),
+            "examples/common.cpp",
+        }, &.{"-std=c++11"});
+        exe.linkLibrary(lib);
+        b.installArtifact(exe);
+
+        const run_cmd = b.addRunArtifact(exe);
+        run_cmd.step.dependOn(b.getInstallStep());
+        if (b.args) |args| run_cmd.addArgs(args);
+
+        const run_step = b.step("run_" ++ example_name, "Run the app");
+        run_step.dependOn(&run_cmd.step);
+    }
+}
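Usage sketch (assumed invocation): with the `inline for` above, every entry in `examples` gains its own run step named `run_<example>`, so `zig build run_main -- -m models/7B/ggml-model-q4_0.bin` would build and launch the `main` example, forwarding the arguments after `--` via `b.args`.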
convert.py (95 changed lines)
@@ -130,6 +130,14 @@ TENSORS_LIST = make_tensors_list()
 TENSORS_SET = set(TENSORS_LIST)
 
 
+def find_n_mult(n_ff: int, n_embd: int) -> int:
+    # hardcoded magic range
+    for n_mult in range(256, 1, -1):
+        calc_ff = (((8*n_embd) // 3 + n_mult - 1) // n_mult)*n_mult
+        if calc_ff == n_ff:
+            return n_mult
+    return 1
+
 @dataclass
 class Params:
     n_vocab: int
@@ -137,21 +145,61 @@ class Params:
     n_mult: int
     n_head: int
     n_layer: int
-    file_type: GGMLFileType
 
     @staticmethod
-    def guessed(model: 'LazyModel', file_type: GGMLFileType) -> 'Params':
-        n_vocab, n_embd = model["tok_embeddings.weight"].shape
+    def guessed(model: 'LazyModel') -> 'Params':
+        # try transformer naming first
+        n_vocab, n_embd = model["model.embed_tokens.weight"].shape if "model.embed_tokens.weight" in model else model["tok_embeddings.weight"].shape
+
+        # try transformer naming first
+        if "model.layers.0.self_attn.q_proj.weight" in model:
+            n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
+        else:
+            n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
+
+        n_head=n_embd // 128 # guessed
+
         return Params(
             n_vocab=n_vocab,
             n_embd=n_embd,
             n_mult=256,
-            n_head=n_embd // 128,
-            n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model),
-            file_type=file_type,
+            n_head=n_head,
+            n_layer=n_layer,
         )
 
+    @staticmethod
+    def loadHFTransformerJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
+        config = json.load(open(config_path))
+
+        n_vocab = config["vocab_size"];
+        n_embd = config["hidden_size"];
+        n_head = config["num_attention_heads"];
+        n_layer = config["num_hidden_layers"];
+        n_ff = config["intermediate_size"];
+
+        n_mult = find_n_mult(n_ff, n_embd);
+
+        return Params(
+            n_vocab=n_vocab,
+            n_embd=n_embd,
+            n_mult=n_mult,
+            n_head=n_head,
+            n_layer=n_layer,
+        )
+
+    @staticmethod
+    def load(model_plus: 'ModelPlus') -> 'Params':
+        orig_config_path = model_plus.paths[0].parent / "params.json"
+        hf_transformer_config_path = model_plus.paths[0].parent / "config.json"
+
+        if hf_transformer_config_path.exists():
+            params = Params.loadHFTransformerJson(model_plus.model, hf_transformer_config_path)
+        else:
+            params = Params.guessed(model_plus.model)
+
+        print(f'params: n_vocab:{params.n_vocab} n_embd:{params.n_embd} n_mult:{params.n_mult} n_head:{params.n_head} n_layer:{params.n_layer}')
+        return params
+
 
 class SentencePieceVocab:
     def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None:
@@ -595,18 +643,17 @@ def permute_lazy(lazy_tensor: LazyTensor, n_head: int) -> LazyTensor:
     return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description)
 
 
-def convert_transformers_to_orig(model: LazyModel) -> LazyModel:
+def convert_transformers_to_orig(model: LazyModel, params: Params) -> LazyModel:
     out: LazyModel = {}
     out["tok_embeddings.weight"] = model["model.embed_tokens.weight"]
     out["norm.weight"] = model["model.norm.weight"]
     out["output.weight"] = model["lm_head.weight"]
 
-    n_head = model["model.layers.0.self_attn.q_proj.weight"].shape[1] // 128
     for i in itertools.count():
         if f"model.layers.{i}.self_attn.q_proj.weight" not in model:
             break
-        out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], n_head)
-        out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], n_head)
+        out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head)
+        out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head)
         out[f"layers.{i}.attention.wv.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
         out[f"layers.{i}.attention.wo.weight"] = model[f"model.layers.{i}.self_attn.o_proj.weight"]
 
@@ -920,7 +967,7 @@ class OutputFile:
     def __init__(self, fname_out: Path) -> None:
         self.fout = open(fname_out, "wb")
 
-    def write_file_header(self, params: Params) -> None:
+    def write_file_header(self, params: Params, file_type: GGMLFileType) -> None:
        self.fout.write(b"ggjt"[::-1]) # magic
         values = [
             1, # file version
@@ -930,7 +977,7 @@ class OutputFile:
             params.n_head,
             params.n_layer,
             params.n_embd // params.n_head, # rot (obsolete)
-            params.file_type.value,
+            file_type.value,
         ]
         self.fout.write(struct.pack("i" * len(values), *values))
 
@@ -951,17 +998,17 @@ class OutputFile:
     def write_vocab_only(fname_out: Path, vocab: Vocab) -> None:
         of = OutputFile(fname_out)
         params = Params(n_vocab=vocab.vocab_size, n_embd=0, n_mult=0,
-                        n_head=1, n_layer=0, file_type=GGMLFileType.AllF32)
+                        n_head=1, n_layer=0)
         of = OutputFile(fname_out)
-        of.write_file_header(params)
+        of.write_file_header(params, file_type=GGMLFileType.AllF32)
         of.write_vocab(vocab)
         of.fout.close()
 
     @staticmethod
-    def write_all(fname_out: Path, params: Params, model: LazyModel, vocab: Vocab) -> None:
+    def write_all(fname_out: Path, params: Params, file_type: GGMLFileType, model: LazyModel, vocab: Vocab) -> None:
         check_vocab_size(params, vocab)
         of = OutputFile(fname_out)
-        of.write_file_header(params)
+        of.write_file_header(params, file_type)
         print("Writing vocab...")
         of.write_vocab(vocab)
 
@@ -997,11 +1044,11 @@ def pick_output_type(model: LazyModel, output_type_str: Optional[str]) -> GGMLFi
     raise Exception(f"Unexpected combination of types: {name_to_type}")
 
 
-def do_necessary_conversions(model: LazyModel) -> LazyModel:
+def do_necessary_conversions(model: LazyModel, params: Params) -> LazyModel:
     model = handle_quantization(model)
 
     if "lm_head.weight" in model:
-        model = convert_transformers_to_orig(model)
+        model = convert_transformers_to_orig(model, params)
     model = filter_and_sort_tensors(model)
 
     return model
@@ -1107,14 +1154,14 @@ def load_vocab(path: Path) -> SentencePieceVocab:
     return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None)
 
 
-def default_outfile(model_paths: List[Path], params: Params) -> Path:
+def default_outfile(model_paths: List[Path], file_type: GGMLFileType) -> Path:
     namestr = {
         GGMLFileType.AllF32: "f32",
         GGMLFileType.MostlyF16: "f16",
         GGMLFileType.MostlyQ4_0: "q4_0",
         GGMLFileType.MostlyQ4_1: "q4_1",
         GGMLFileType.PerLayerIsQ4_1: "q4_1",
-    }[params.file_type]
+    }[file_type]
     ret = model_paths[0].parent / f"ggml-model-{namestr}.bin"
     if ret in model_paths:
         sys.stderr.write(
@@ -1164,13 +1211,13 @@ def main(args_in: Optional[List[str]] = None) -> None:
         else:
             vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
             vocab = load_vocab(vocab_dir)
+        params = Params.load(model_plus)
         model = model_plus.model
-        model = do_necessary_conversions(model)
+        model = do_necessary_conversions(model, params)
         output_type = pick_output_type(model, args.outtype)
         model = convert_to_output_type(model, output_type)
-        params = Params.guessed(model, output_type)
-        outfile = args.outfile or default_outfile(model_plus.paths, params)
-        OutputFile.write_all(outfile, params, model, vocab)
+        outfile = args.outfile or default_outfile(model_plus.paths, output_type)
+        OutputFile.write_all(outfile, params, output_type, model, vocab)
         print(f"Wrote {outfile}")
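The new `find_n_mult` helper inverts LLaMA's feed-forward sizing rule, where `n_ff` is `(8*n_embd)/3` rounded up to a multiple of `n_mult`, by scanning candidates from 256 downward. A hedged C++ transcription of the same loop (the shipped implementation is the Python shown above):

```cpp
#include <cstdio>

// Mirror of convert.py's find_n_mult(): find the multiplier that rounds
// (8*n_embd)/3 up to exactly n_ff; fall back to 1 if none matches.
static int find_n_mult(int n_ff, int n_embd) {
    for (int n_mult = 256; n_mult > 1; --n_mult) {
        const int calc_ff = ((8*n_embd/3 + n_mult - 1) / n_mult) * n_mult;
        if (calc_ff == n_ff) {
            return n_mult;
        }
    }
    return 1;
}

int main() {
    // LLaMA-7B config.json values: hidden_size 4096, intermediate_size 11008.
    printf("n_mult = %d\n", find_n_mult(11008, 4096)); // prints: n_mult = 256
    return 0;
}
```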
examples/common.cpp
@@ -541,7 +541,7 @@ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::s
     return res;
 }
 
-struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
+std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params) {
     auto lparams = llama_context_default_params();
 
     lparams.n_ctx = params.n_ctx;
@@ -557,25 +557,33 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
     lparams.logits_all = params.perplexity;
     lparams.embedding = params.embedding;
 
-    llama_context * lctx = llama_init_from_file(params.model.c_str(), lparams);
-
-    if (lctx == NULL) {
+    llama_model * model = llama_load_model_from_file(params.model.c_str(), lparams);
+    if (model == NULL) {
         fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
-        return NULL;
+        return std::make_tuple(nullptr, nullptr);
+    }
+
+    llama_context * lctx = llama_new_context_with_model(model, lparams);
+    if (lctx == NULL) {
+        fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
+        llama_free_model(model);
+        return std::make_tuple(nullptr, nullptr);
     }
 
     if (!params.lora_adapter.empty()) {
-        int err = llama_apply_lora_from_file(lctx,
+        int err = llama_model_apply_lora_from_file(model,
                                              params.lora_adapter.c_str(),
                                              params.lora_base.empty() ? NULL : params.lora_base.c_str(),
                                              params.n_threads);
         if (err != 0) {
             fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
-            return NULL;
+            llama_free(lctx);
+            llama_free_model(model);
+            return std::make_tuple(nullptr, nullptr);
         }
     }
 
-    return lctx;
+    return std::make_tuple(model, lctx);
 }
 
 void console_init(console_state & con_st) {
examples/common.h
@@ -9,6 +9,7 @@
 #include <random>
 #include <thread>
 #include <unordered_map>
+#include <tuple>
 
 #if !defined (_WIN32)
 #include <stdio.h>
@@ -96,7 +97,7 @@ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::s
 // Model utils
 //
 
-struct llama_context * llama_init_from_gpt_params(const gpt_params & params);
+std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params);
 
 //
 // Console utils
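The signature change above has one practical consequence for every example in this commit: callers receive a (model, context) pair, must treat a null model as failure, and must free the context before the model. A minimal caller-side sketch, assuming the post-change common.h and llama.h (the `run` helper is hypothetical, error handling trimmed):

```cpp
#include <tuple>
#include "common.h"
#include "llama.h"

int run(const gpt_params & params) {
    llama_model   * model;
    llama_context * ctx;

    // One call now yields both objects; (nullptr, nullptr) signals failure.
    std::tie(model, ctx) = llama_init_from_gpt_params(params);
    if (model == NULL) {
        return 1;
    }

    // ... evaluate and sample using ctx ...

    llama_free(ctx);         // contexts first,
    llama_free_model(model); // then the shared weights
    return 0;
}
```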
examples/embedding/embedding.cpp
@@ -37,11 +37,12 @@ int main(int argc, char ** argv) {
 
     llama_init_backend();
 
+    llama_model * model;
     llama_context * ctx;
 
     // load the model
-    ctx = llama_init_from_gpt_params(params);
-    if (ctx == NULL) {
+    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    if (model == NULL) {
         fprintf(stderr, "%s: error: unable to load model\n", __func__);
         return 1;
     }
@@ -90,6 +91,7 @@ int main(int argc, char ** argv) {
 
     llama_print_timings(ctx);
     llama_free(ctx);
+    llama_free_model(model);
 
     return 0;
 }
examples/main/main.cpp
@@ -111,12 +111,13 @@ int main(int argc, char ** argv) {
         ggml_numa_init();
     }
 
+    llama_model * model;
     llama_context * ctx;
     g_ctx = &ctx;
 
     // load the model and apply lora adapter, if any
-    ctx = llama_init_from_gpt_params(params);
-    if (ctx == NULL) {
+    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    if (model == NULL) {
         fprintf(stderr, "%s: error: unable to load model\n", __func__);
         return 1;
     }
@@ -143,6 +144,7 @@ int main(int argc, char ** argv) {
 
         llama_print_timings(ctx);
         llama_free(ctx);
+        llama_free_model(model);
 
         return 0;
     }
@@ -151,6 +153,7 @@ int main(int argc, char ** argv) {
     if (params.export_cgraph) {
         llama_eval_export(ctx, "llama.ggml");
         llama_free(ctx);
+        llama_free_model(model);
 
         return 0;
     }
@@ -670,6 +673,7 @@ int main(int argc, char ** argv) {
 
     llama_print_timings(ctx);
     llama_free(ctx);
+    llama_free_model(model);
 
     return 0;
 }
examples/perplexity/perplexity.cpp
@@ -149,11 +149,12 @@ int main(int argc, char ** argv) {
 
     llama_init_backend();
 
+    llama_model * model;
     llama_context * ctx;
 
     // load the model and apply lora adapter, if any
-    ctx = llama_init_from_gpt_params(params);
-    if (ctx == NULL) {
+    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    if (model == NULL) {
         fprintf(stderr, "%s: error: unable to load model\n", __func__);
         return 1;
     }
@@ -169,6 +170,7 @@ int main(int argc, char ** argv) {
 
     llama_print_timings(ctx);
     llama_free(ctx);
+    llama_free_model(model);
 
     return 0;
 }
examples/quantize-stats/quantize-stats.cpp
@@ -320,6 +320,7 @@ int main(int argc, char ** argv) {
     fprintf(stderr, "Loading model\n");
 
     const int64_t t_main_start_us = ggml_time_us();
+    llama_model * model;
     llama_context * ctx;
 
     {
@@ -330,10 +331,18 @@ int main(int argc, char ** argv) {
         lparams.f16_kv = false;
         lparams.use_mlock = false;
 
-        ctx = llama_init_from_file(params.model.c_str(), lparams);
+        model = llama_load_model_from_file(params.model.c_str(), lparams);
 
-        if (ctx == NULL) {
+        if (model == NULL) {
             fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
             return 1;
         }
+
+        ctx = llama_new_context_with_model(model, lparams);
+
+        if (ctx == NULL) {
+            fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
+            llama_free_model(model);
+            return 1;
+        }
     }
@@ -357,6 +366,7 @@ int main(int argc, char ** argv) {
             fprintf(stderr, "%s: error: Quantization should be tested with a float model, "
                 "this model contains already quantized layers (%s is type %d)\n", __func__, kv_tensor.first.c_str(), kv_tensor.second->type);
             llama_free(ctx);
+            llama_free_model(model);
             return 1;
         }
         included_layers++;
@@ -415,6 +425,7 @@ int main(int argc, char ** argv) {
 
 
     llama_free(ctx);
+    llama_free_model(model);
     // report timing
     {
         const int64_t t_main_end_us = ggml_time_us();
examples/save-load-state/save-load-state.cpp
@@ -35,12 +35,22 @@ int main(int argc, char ** argv) {
     auto last_n_tokens_data = std::vector<llama_token>(params.repeat_last_n, 0);
 
     // init
-    auto ctx = llama_init_from_file(params.model.c_str(), lparams);
+    auto model = llama_load_model_from_file(params.model.c_str(), lparams);
+    if (model == nullptr) {
+        return 1;
+    }
+    auto ctx = llama_new_context_with_model(model, lparams);
+    if (ctx == nullptr) {
+        llama_free_model(model);
+        return 1;
+    }
     auto tokens = std::vector<llama_token>(params.n_ctx);
     auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), int(tokens.size()), true);
 
     if (n_prompt_tokens < 1) {
         fprintf(stderr, "%s : failed to tokenize prompt\n", __func__);
+        llama_free(ctx);
+        llama_free_model(model);
         return 1;
     }
 
@@ -84,6 +94,8 @@ int main(int argc, char ** argv) {
         printf("%s", next_token_str);
         if (llama_eval(ctx, &next_token, 1, n_past, params.n_threads)) {
             fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
+            llama_free(ctx);
+            llama_free_model(model);
             return 1;
         }
         n_past += 1;
@@ -91,23 +103,27 @@ int main(int argc, char ** argv) {
 
     printf("\n\n");
 
-    // free old model
+    // free old context
     llama_free(ctx);
 
-    // load new model
-    auto ctx2 = llama_init_from_file(params.model.c_str(), lparams);
+    // make new context
+    auto ctx2 = llama_new_context_with_model(model, lparams);
 
     // Load state (rng, logits, embedding and kv_cache) from file
     {
         FILE *fp_read = fopen("dump_state.bin", "rb");
         if (state_size != llama_get_state_size(ctx2)) {
             fprintf(stderr, "\n%s : failed to validate state size\n", __func__);
+            llama_free(ctx2);
+            llama_free_model(model);
             return 1;
         }
 
         const size_t ret = fread(state_mem, 1, state_size, fp_read);
         if (ret != state_size) {
             fprintf(stderr, "\n%s : failed to read state\n", __func__);
+            llama_free(ctx2);
+            llama_free_model(model);
             return 1;
         }
 
@@ -138,6 +154,8 @@ int main(int argc, char ** argv) {
         printf("%s", next_token_str);
         if (llama_eval(ctx2, &next_token, 1, n_past, params.n_threads)) {
             fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
+            llama_free(ctx2);
+            llama_free_model(model);
             return 1;
         }
         n_past += 1;
@@ -145,5 +163,8 @@ int main(int argc, char ** argv) {
 
     printf("\n\n");
 
+    llama_free(ctx2);
+    llama_free_model(model);
+
     return 0;
 }
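What the reworked example buys: the weights now live in a `llama_model` that outlives any `llama_context`, so saved state can be replayed into a fresh context without reloading the file. A compressed sketch of that pattern, assuming the llama.h API at this commit, with the example's file I/O replaced by an in-memory buffer and a hypothetical `snapshot_roundtrip` wrapper:

```cpp
#include <cstdint>
#include <vector>
#include "llama.h"

int snapshot_roundtrip(const char * path, llama_context_params lparams) {
    llama_model * model = llama_load_model_from_file(path, lparams);
    if (model == nullptr) return 1;

    llama_context * ctx = llama_new_context_with_model(model, lparams);
    // ... evaluate some tokens on ctx ...

    std::vector<uint8_t> state(llama_get_state_size(ctx));
    llama_copy_state_data(ctx, state.data()); // capture rng/logits/kv cache
    llama_free(ctx);                          // the model stays loaded

    llama_context * ctx2 = llama_new_context_with_model(model, lparams);
    llama_set_state_data(ctx2, state.data()); // resume where ctx left off
    // ... continue evaluating on ctx2 ...

    llama_free(ctx2);
    llama_free_model(model);
    return 0;
}
```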
examples/server/README.md
@@ -21,6 +21,7 @@ Command line options:
 - `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`.
 - `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`.
 - `--port`: Set the port to listen. Default: `8080`.
+- `--embedding`: Enable embedding extraction, Default: disabled.
 
 ## Build
 
@@ -119,14 +120,14 @@ node .
 
     `top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.9).
 
-    `n_predict`: Set the number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. (default: 128, -1 = infinity).
+    `n_predict`: Set the number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. (default: 128, -1 = infinity).
 
     `n_keep`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context.
     By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the initial prompt.
 
     `stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`.
 
-    `prompt`: Provide a prompt. Internally, the prompt is compared, and it detects if a part has already been evaluated, and the remaining part will be evaluate.
+    `prompt`: Provide a prompt. Internally, the prompt is compared, and it detects if a part has already been evaluated, and the remaining part will be evaluate. A space is inserted in the front like main.cpp does.
 
     `stop`: Specify a JSON array of stopping strings.
     These words will not be included in the completion, so make sure to add them to the prompt for the next iteration (default: []).
@@ -163,6 +164,14 @@ node .
 
     `content`: Set the text to tokenize.
 
+    Note that the special `BOS` token is not added in fron of the text and also a space character is not inserted automatically as it is for `/completion`.
+
+- **POST** `/embedding`: Generate embedding of a given text just as [the embedding example](../embedding) does.
+
+    *Options:*
+
+    `content`: Set the text to process.
+
 ## More examples
 
 ### Interactive mode
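Usage sketch (request shape taken from this README; the exact invocation is assumed): start the server with `--embedding` enabled, then `curl -X POST http://127.0.0.1:8080/embedding -d '{"content": "Hello"}'` returns a JSON object whose `embedding` field holds the `n_embd`-dimensional vector for the supplied text.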
examples/server/server.cpp
@@ -115,6 +115,7 @@ struct llama_server_context {
     std::vector<llama_token> embd;
     std::vector<llama_token> last_n_tokens;
 
+    llama_model * model = nullptr;
     llama_context * ctx = nullptr;
     gpt_params params;
 
@@ -130,6 +131,10 @@ struct llama_server_context {
             llama_free(ctx);
             ctx = nullptr;
         }
+        if (model) {
+            llama_free_model(model);
+            model = nullptr;
+        }
     }
 
     void rewind() {
@@ -150,8 +155,8 @@ struct llama_server_context {
 
     bool loadModel(const gpt_params & params_) {
         params = params_;
-        ctx = llama_init_from_gpt_params(params);
-        if (ctx == nullptr) {
+        std::tie(model, ctx) = llama_init_from_gpt_params(params);
+        if (model == nullptr) {
             LOG_ERROR("unable to load model", { { "model", params_.model } });
             return false;
         }
@@ -254,6 +259,11 @@ struct llama_server_context {
             n_past += n_eval;
         }
 
+        if (params.n_predict == 0) {
+            has_next_token = false;
+            return llama_token_eos();
+        }
+
         // out of user input, sample next token
        const float temp = params.temp;
         const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
@@ -315,10 +325,10 @@ struct llama_server_context {
                 id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
             } else {
                 // Temperature sampling
+                llama_sample_top_k(ctx, &candidates_p, top_k, 1);
                 llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1);
                 llama_sample_typical(ctx, &candidates_p, typical_p, 1);
                 llama_sample_top_p(ctx, &candidates_p, top_p, 1);
-                llama_sample_top_k(ctx, &candidates_p, top_k, 1);
                 llama_sample_temperature(ctx, &candidates_p, temp);
                 id = llama_sample_token(ctx, &candidates_p);
             }
@@ -419,6 +429,19 @@ struct llama_server_context {
 
         return token_text;
     }
+
+    std::vector<float> getEmbedding() {
+        static const int n_embd = llama_n_embd(ctx);
+        if (!params.embedding) {
+            LOG_WARNING("embedding disabled", {
+                { "params.embedding", params.embedding },
+            });
+            return std::vector<float>(n_embd, 0.0f);
+        }
+        const float * data = llama_get_embeddings(ctx);
+        std::vector<float> embedding(data, data + n_embd);
+        return embedding;
+    }
 };
 
 static void server_print_usage(const char * argv0, const gpt_params & params,
@@ -457,6 +480,7 @@ static void server_print_usage(const char * argv0, const gpt_params & params,
     fprintf(stderr, " --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
     fprintf(stderr, " --port PORT port to listen (default (default: %d)\n", sparams.port);
     fprintf(stderr, " -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
+    fprintf(stderr, " --embedding enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
     fprintf(stderr, "\n");
 }
 
@@ -603,6 +627,8 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
             params.use_mlock = true;
         } else if (arg == "--no-mmap") {
             params.use_mmap = false;
+        } else if (arg == "--embedding") {
+            params.embedding = true;
         } else {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
             server_print_usage(argv[0], default_params, default_sparams);
@@ -646,6 +672,12 @@ static json format_generation_settings(llama_server_context & llama) {
     };
 }
 
+static json format_embedding_response(llama_server_context & llama) {
+    return json {
+        { "embedding", llama.getEmbedding() },
+    };
+}
+
 static json format_final_response(llama_server_context & llama, const std::string & content) {
     return json {
         { "content", content },
@@ -881,12 +913,27 @@ int main(int argc, char ** argv) {
 
     svr.Post("/tokenize", [&llama](const Request & req, Response & res) {
         const json body = json::parse(req.body);
-        const std::string content = body["content"].get<std::string>();
+        const std::string content = body.value("content", "");
         const std::vector<llama_token> tokens = llama_tokenize(llama.ctx, content, false);
         const json data = format_tokenizer_response(tokens);
         return res.set_content(data.dump(), "application/json");
     });
 
+    svr.Post("/embedding", [&llama](const Request & req, Response & res) {
+        const json body = json::parse(req.body);
+
+        llama.rewind();
+        llama_reset_timings(llama.ctx);
+        llama.params.prompt = body.value("content", "");
+        llama.params.n_predict = 0;
+        llama.loadPrompt();
+        llama.beginCompletion();
+        llama.doCompletion();
+
+        const json data = format_embedding_response(llama);
+        return res.set_content(data.dump(), "application/json");
+    });
+
     svr.set_logger(log_server_request);
 
     svr.set_exception_handler([](const Request &, Response & res, std::exception_ptr ep) {
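A note on the `/tokenize` hardening above: with nlohmann::json (the `json.hpp` the server builds against), `body.value("content", "")` tolerates a missing key, whereas keyed access does not. A standalone illustration, assuming the upstream single-header library rather than code from this commit:

```cpp
#include <iostream>
#include <string>
#include <nlohmann/json.hpp> // assumption: standalone nlohmann/json; the server bundles json.hpp

int main() {
    const nlohmann::json body = nlohmann::json::parse(R"({"other": 1})");

    // value() tolerates an absent key by returning the fallback.
    const std::string content = body.value("content", "");
    std::cout << "content: '" << content << "'\n"; // prints: content: ''

    // at() on an absent key throws, which previously bubbled up out of the
    // request handler as an exception for malformed POST bodies.
    try {
        const std::string strict = body.at("content").get<std::string>();
        std::cout << strict << "\n";
    } catch (const std::exception & e) {
        std::cout << "exception: " << e.what() << "\n";
    }
    return 0;
}
```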
examples/simple/simple.cpp
@@ -68,11 +68,12 @@ int main(int argc, char ** argv)
 
     llama_init_backend();
 
+    llama_model * model;
     llama_context * ctx;
 
-    ctx = llama_init_from_gpt_params( params );
+    std::tie(model, ctx) = llama_init_from_gpt_params( params );
 
-    if ( ctx == NULL )
+    if ( model == NULL )
     {
         fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
         return 1;
@@ -170,6 +171,7 @@ int main(int argc, char ** argv)
     } // wend of main loop
 
     llama_free( ctx );
+    llama_free_model( model );
 
     return 0;
 }
examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -3054,7 +3054,8 @@ int main(int argc, char ** argv) {
     struct llama_context_params llama_params = llama_context_default_params();
     llama_params.vocab_only = true;
 
-    struct llama_context * lctx = llama_init_from_file(params.fn_vocab_model, llama_params);
+    struct llama_model * lmodel = llama_load_model_from_file(params.fn_vocab_model, llama_params);
+    struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params);
 
     struct llama_vocab vocab;
     {
@@ -3395,6 +3396,8 @@ int main(int argc, char ** argv) {
     delete[] compute_addr;
     delete[] compute_buf_0;
     delete[] compute_buf_1;
+    llama_free(lctx);
+    llama_free_model(lmodel);
     ggml_free(model.ctx);
 
     return 0;
flake.nix (48 changed lines)
@@ -9,27 +9,33 @@
         inherit (pkgs.stdenv) isAarch64 isDarwin;
         inherit (pkgs.lib) optionals;
         isM1 = isAarch64 && isDarwin;
-        osSpecific =
-          if isM1 then with pkgs.darwin.apple_sdk_11_0.frameworks; [ Accelerate MetalKit MetalPerformanceShaders MetalPerformanceShadersGraph ]
-          else if isDarwin then with pkgs.darwin.apple_sdk.frameworks; [ Accelerate CoreGraphics CoreVideo ]
-          else [ ];
-        pkgs = import nixpkgs {
-          inherit system;
-        };
-        llama-python = pkgs.python310.withPackages (ps: with ps; [
-          numpy
-          sentencepiece
-        ]);
-      in
-      {
+        osSpecific = if isM1 then
+          with pkgs.darwin.apple_sdk_11_0.frameworks; [
+            Accelerate
+            MetalKit
+            MetalPerformanceShaders
+            MetalPerformanceShadersGraph
+          ]
+        else if isDarwin then
+          with pkgs.darwin.apple_sdk.frameworks; [
+            Accelerate
+            CoreGraphics
+            CoreVideo
+          ]
+        else
+          [ ];
+        pkgs = import nixpkgs { inherit system; };
+        llama-python =
+          pkgs.python310.withPackages (ps: with ps; [ numpy sentencepiece ]);
+      in {
         packages.default = pkgs.stdenv.mkDerivation {
           name = "llama.cpp";
           src = ./.;
-          postPatch =
-            if isM1 then ''
+          postPatch = if isM1 then ''
             substituteInPlace ./ggml-metal.m \
-              --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/ggml-metal.metal\";"
-            '' else "";
+              --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
+          '' else
+            "";
           nativeBuildInputs = with pkgs; [ cmake ];
           buildInputs = osSpecific;
           cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" ] ++ (optionals isM1 [
@@ -62,11 +68,7 @@
         };
         apps.default = self.apps.${system}.llama;
         devShells.default = pkgs.mkShell {
-          packages = with pkgs; [
-            cmake
-            llama-python
-          ] ++ osSpecific;
+          packages = with pkgs; [ cmake llama-python ] ++ osSpecific;
         };
-      }
-    );
+      });
 }
ggml-cuda.cu (373 changes)
@@ -117,7 +117,13 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 blo

 //================================= k-quants

+#ifdef GGML_QKK_64
+#define QK_K 64
+#define K_SCALE_SIZE 4
+#else
 #define QK_K 256
+#define K_SCALE_SIZE 12
+#endif

 typedef struct {
     uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits

@@ -128,13 +134,25 @@ typedef struct {
 static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");

 typedef struct {
-    uint8_t hmask[QK_K/8];
-    uint8_t qs[QK_K/4]; // nibbles / quants
-    uint8_t scales[3*QK_K/64];
-    half d;
+    uint8_t hmask[QK_K/8];        // quants - high bit
+    uint8_t qs[QK_K/4];           // quants - low 2 bits
+#ifdef GGML_QKK_64
+    uint8_t scales[2];            // scales, quantized with 8 bits
+#else
+    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
+#endif
+    half d;                       // super-block scale
 } block_q3_K;
-static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + 11 * QK_K / 64, "wrong q3_K block size/padding");
+//static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + K_SCALE_SIZE, "wrong q3_K block size/padding");

+#ifdef GGML_QKK_64
+typedef struct {
+    half    d[2];       // super-block scales/mins
+    uint8_t scales[2];  // 4-bit block scales/mins
+    uint8_t qs[QK_K/2]; // 4--bit quants
+} block_q4_K;
+static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding");
+#else
 typedef struct {
     half d;    // super-block scale for quantized scales
     half dmin; // super-block scale for quantized mins

@@ -142,15 +160,26 @@ typedef struct {
     uint8_t qs[QK_K/2]; // 4--bit quants
 } block_q4_K;
 static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
+#endif

+#ifdef GGML_QKK_64
 typedef struct {
-    half d;                    // super-block scale for quantized scales
-    half dmin;                 // super-block scale for quantized mins
-    uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
+    half d;                 // super-block scale
+    int8_t scales[QK_K/16]; // block scales
     uint8_t qh[QK_K/8];     // quants, high bit
     uint8_t qs[QK_K/2];     // quants, low 4 bits
 } block_q5_K;
-static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
+static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
+#else
+typedef struct {
+    half d;                       // super-block scale for quantized scales
+    half dmin;                    // super-block scale for quantized mins
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8];           // quants, high bit
+    uint8_t qs[QK_K/2];           // quants, low 4 bits
+} block_q5_K;
+static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
+#endif

 typedef struct {
     uint8_t ql[QK_K/2]; // quants, lower 4 bits
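A small host-side C++ sketch (not from the patch) that checks the QK_K == 64 layouts declared above; `half` is stubbed with a 2-byte integer purely so the sizeof arithmetic compiles on the CPU, and the byte counts mirror the static_asserts in the diff:

// layout sanity check for the 64-weight super-block variants
#include <cstdint>

typedef uint16_t half; // stand-in: only the 2-byte size matters here

#define QK_K 64

typedef struct {
    half    d[2];       // super-block scales/mins
    uint8_t scales[2];  // 4-bit block scales/mins
    uint8_t qs[QK_K/2]; // 4-bit quants
} block_q4_K;

typedef struct {
    half    d;               // super-block scale
    int8_t  scales[QK_K/16]; // block scales
    uint8_t qh[QK_K/8];      // quants, high bit
    uint8_t qs[QK_K/2];      // quants, low 4 bits
} block_q5_K;

static_assert(sizeof(block_q4_K) == 2*sizeof(half) + QK_K/2 + 2,               "q4_K: 38 bytes per 64 weights");
static_assert(sizeof(block_q5_K) == sizeof(half) + QK_K/2 + QK_K/8 + QK_K/16,  "q5_K: 46 bytes per 64 weights");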
@@ -349,13 +378,14 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in

 static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {

     const int i   = blockIdx.x;
+    const block_q2_K * x = (const block_q2_K *) vx;
+
     const int tid = threadIdx.x;
+#if QK_K == 256
     const int n   = tid/32;
     const int l   = tid - 32*n;
     const int is  = 8*n + l/16;
-
-    const block_q2_K * x = (const block_q2_K *) vx;

     const uint8_t q = x[i].qs[32*n + l];
     float * y = yy + i*QK_K + 128*n;

@@ -365,21 +395,32 @@ static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {
     y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
     y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
     y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
+#else
+    const int is = tid/16; // 0 or 1
+    const int il = tid%16; // 0...15
+    const uint8_t q = x[i].qs[il] >> (2*is);
+    float * y = yy + i*QK_K + 16*is + il;
+    float dall = x[i].d;
+    float dmin = x[i].dmin;
+    y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
+    y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
+#endif

 }

 static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {

-    int r = threadIdx.x/4;
-    int i = blockIdx.x;
-    int tid = r/2;
-    int is0 = r%2;
-    int l0 = 16*is0 + 4*(threadIdx.x%4);
-    int n = tid / 4;
-    int j = tid - 4*n;
+    const int i = blockIdx.x;

     const block_q3_K * x = (const block_q3_K *) vx;

+#if QK_K == 256
+    const int r = threadIdx.x/4;
+    const int tid = r/2;
+    const int is0 = r%2;
+    const int l0 = 16*is0 + 4*(threadIdx.x%4);
+    const int n = tid / 4;
+    const int j = tid - 4*n;
+
     uint8_t m = 1 << (4*n + j);
     int is = 8*n + 2*j + is0;
     int shift = 2*j;
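A CPU reference sketch (illustrative, not from the patch) of what the new QK_K == 64 q2_K branch above computes per super-block: 64 weights split into four 16-weight sub-blocks, each with a 4-bit scale and a 4-bit min packed into scales[0..3]; the function name is hypothetical:

// reference dequantization of one 64-weight q2_K super-block
#include <cstdint>

void dequant_q2_k64(float d, float dmin,
                    const uint8_t scales[4], const uint8_t qs[16],
                    float y[64]) {
    for (int s = 0; s < 4; ++s) {                  // 4 sub-blocks of 16 weights
        const float dl = d    * (scales[s] & 0xF); // sub-block scale (low nibble)
        const float ml = dmin * (scales[s] >> 4);  // sub-block min (high nibble)
        for (int l = 0; l < 16; ++l) {
            // sub-block s holds bit pair 2*s of packed byte l
            y[16*s + l] = dl * ((qs[l] >> (2*s)) & 3) - ml;
        }
    }
}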
@@ -396,9 +437,31 @@ static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {
     const uint8_t * hm = x[i].hmask;

     for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
+#else
+    const int tid = threadIdx.x;
+    const int is  = tid/16; // 0 or 1
+    const int il  = tid%16; // 0...15
+    const int im  = il/8;   // 0...1
+    const int in  = il%8;   // 0...7
+
+    float * y = yy + i*QK_K + 16*is + il;
+
+    const uint8_t q = x[i].qs[il] >> (2*is);
+    const uint8_t h = x[i].hmask[in] >> (2*is + im);
+    const float   d = (float)x[i].d;
+
+    if (is == 0) {
+        y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
+        y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
+    } else {
+        y[ 0] = d * ((x[i].scales[0] >> 4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
+        y[32] = d * ((x[i].scales[1] >> 4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
+    }
+#endif

 }

+#if QK_K == 256
 static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
     if (j < 4) {
         d = q[j] & 63; m = q[j + 4] & 63;

@@ -407,19 +470,14 @@ static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t
         m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4);
     }
 }
+#endif

 static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
     const block_q4_K * x = (const block_q4_K *) vx;

     const int i = blockIdx.x;

-    //// assume 64 threads - this is very slightly better than the one below
-    //const int tid = threadIdx.x;
-    //const int il  = tid/16;
-    //const int ir  = tid%16;
-    //const int is  = 2*il;
-    //const int n   = 2;
+#if QK_K == 256

     // assume 32 threads
     const int tid = threadIdx.x;
     const int il  = tid/8;

@@ -443,6 +501,15 @@ static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
         y[l + 0] = d1 * (q[l] & 0xF) - m1;
         y[l +32] = d2 * (q[l] >>  4) - m2;
     }
+#else
+    const int tid = threadIdx.x;
+    const uint8_t * q = x[i].qs;
+    float * y = yy + i*QK_K;
+    const float d = (float)x[i].d[0];
+    const float m = (float)x[i].d[1];
+    y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
+    y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >>  4) - m * (x[i].scales[1] >> 4);
+#endif
 }

 static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {

@@ -450,6 +517,7 @@ static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {

     const int i = blockIdx.x;

+#if QK_K == 256
     // assume 64 threads - this is very slightly better than the one below
     const int tid = threadIdx.x;
     const int il  = tid/16; // il is in 0...3

@@ -476,12 +544,25 @@ static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
     hm <<= 1;
     y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2;
     y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
+#else
+    const int tid = threadIdx.x;
+    const uint8_t q = x[i].qs[tid];
+    const int im = tid/8;  // 0...3
+    const int in = tid%8;  // 0...7
+    const int is = tid/16; // 0 or 1
+    const uint8_t h = x[i].qh[in] >> im;
+    const float   d = x[i].d;
+    float * y = yy + i*QK_K + tid;
+    y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16));
+    y[32] = d * x[i].scales[is+2] * ((q >>  4) - ((h >> 4) & 1 ? 0 : 16));
+#endif
 }

 static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
     const block_q6_K * x = (const block_q6_K *) vx;

     const int i = blockIdx.x;
+#if QK_K == 256

     // assume 64 threads - this is very slightly better than the one below
     const int tid = threadIdx.x;

@@ -501,6 +582,24 @@ static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
     y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
     y[64] = d * sc[4] * ((int8_t)((ql[ 0] >>  4) | (((qh >> 4) & 3) << 4)) - 32);
     y[96] = d * sc[6] * ((int8_t)((ql[32] >>  4) | (((qh >> 6) & 3) << 4)) - 32);
+#else
+
+    // assume 32 threads
+    const int tid = threadIdx.x;
+    const int ip  = tid/16;      // 0 or 1
+    const int il  = tid - 16*ip; // 0...15
+
+    float * y = yy + i*QK_K + 16*ip + il;
+
+    const float d = x[i].d;
+
+    const uint8_t ql = x[i].ql[16*ip + il];
+    const uint8_t qh = x[i].qh[il] >> (2*ip);
+    const int8_t * sc = x[i].scales;
+
+    y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
+    y[32] = d * sc[ip+2] * ((int8_t)((ql >>  4) | (((qh >> 4) & 3) << 4)) - 32);
+#endif
 }

 static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
@@ -515,6 +614,9 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float

     const block_q2_K * x = (const block_q2_K *)vx + ib0;

+    float tmp = 0; // partial sum for thread in warp
+
+#if QK_K == 256
     const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...15
     const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0,1

@@ -528,8 +630,6 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float
     const int s_offset = 8*im;
     const int y_offset = 128*im + l0;

-    float tmp = 0; // partial sum for thread in warp
-
     uint32_t aux[4];
     const uint8_t * d = (const uint8_t *)aux;
     const uint8_t * m = (const uint8_t *)(aux + 2);

@@ -565,6 +665,39 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float
         tmp += dall * sum1 - dmin * sum2;

     }
+#else
+    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15 or 0...7
+    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0....1 or 0...3
+    const int offset = tid * K_QUANTS_PER_ITERATION;
+
+    uint32_t uaux[2];
+    const uint8_t * d = (const uint8_t *)uaux;
+
+    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
+
+        const float    * y = yy + i * QK_K + offset;
+        const uint8_t  * q = x[i].qs + offset;
+        const uint32_t * s = (const uint32_t *)x[i].scales;
+
+        uaux[0] = s[0] & 0x0f0f0f0f;
+        uaux[1] = (s[0] >> 4) & 0x0f0f0f0f;
+
+        const half2 * dh = (const half2 *)&x[i].d;
+
+        const float2 dall = __half22float2(dh[0]);
+
+        float sum1 = 0, sum2 = 0;
+        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
+            const uint8_t ql = q[l];
+            sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3)
+                  + y[l+16] * d[1] * ((ql >> 2) & 3)
+                  + y[l+32] * d[2] * ((ql >> 4) & 3)
+                  + y[l+48] * d[3] * ((ql >> 6) & 3);
+            sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7];
+        }
+        tmp += dall.x * sum1 - dall.y * sum2;
+    }
+#endif

     // sum up partial sums and write back result
     __syncthreads();
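A sketch of the 0x0f0f0f0f nibble-splitting trick the new q2_K branch uses above: one 32-bit load of scales[0..3] is separated into four 4-bit scales and four 4-bit mins, then read back bytewise. The helper name is hypothetical and memcpy stands in for the kernel's pointer casts:

// host-side illustration of the scale/min unpacking
#include <cstdint>
#include <cstring>

void unpack_scales(const uint8_t scales[4], uint8_t sc[4], uint8_t mn[4]) {
    uint32_t s;
    std::memcpy(&s, scales, 4);      // s[0] in the kernel
    uint32_t uaux[2];
    uaux[0] = s        & 0x0f0f0f0f; // low nibbles  -> scales (d[0..3])
    uaux[1] = (s >> 4) & 0x0f0f0f0f; // high nibbles -> mins   (d[4..7])
    std::memcpy(sc, &uaux[0], 4);
    std::memcpy(mn, &uaux[1], 4);
}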
@@ -573,16 +706,13 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
     }

-    if (tid == 0) {
+    if (threadIdx.x == 0) {
         dst[row] = tmp;
     }
 }

 static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {

-    const uint16_t kmask1 = 0x0303;
-    const uint16_t kmask2 = 0x0f0f;
-
     const int row = blockIdx.y*blockDim.y + threadIdx.y;
     if (row > nrows) return;

@@ -591,6 +721,13 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float

     const block_q3_K * x = (const block_q3_K *)vx + ib0;

+    float tmp = 0; // partial sum for thread in warp
+
+#if QK_K == 256
+
+    const uint16_t kmask1 = 0x0303;
+    const uint16_t kmask2 = 0x0f0f;
+
     const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...16
     const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0,1

@@ -610,8 +747,6 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float

     const uint16_t s_shift = 4*im;

-    float tmp = 0; // partial sum for thread in warp
-
     for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {

         const float * y = yy + i * QK_K + y_offset;

@@ -640,6 +775,34 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float
         tmp += d * sum;

     }
+#else
+
+    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15 or 0...7
+    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0....1 or 0...3
+    const int offset = tid * K_QUANTS_PER_ITERATION;         // 0...15 or 0...14
+    const int in = offset/8;                                 // 0 or 1
+    const int im = offset%8;                                 // 0...7
+
+    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
+
+        const float   * y = yy + i * QK_K + offset;
+        const uint8_t * q = x[i].qs + offset;
+        const uint8_t * s = x[i].scales;
+
+        const float dall = (float)x[i].d;
+
+        float sum = 0;
+        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
+            const uint8_t hl = x[i].hmask[im+l] >> in;
+            const uint8_t ql = q[l];
+            sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4))
+                 + y[l+16] * dall * ((s[0] >>  4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4))
+                 + y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4))
+                 + y[l+48] * dall * ((s[1] >>  4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4));
+        }
+        tmp += sum;
+    }
+#endif

     // sum up partial sums and write back result
     __syncthreads();

@@ -648,22 +811,25 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
     }

-    if (tid == 0) {
+    if (threadIdx.x == 0) {
         dst[row] = tmp;
     }
 }

 static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {

-    const uint16_t kmask1 = 0x3f3f;
-    const uint16_t kmask2 = 0x0f0f;
-    const uint16_t kmask3 = 0xc0c0;
-
     const int row = blockIdx.y*blockDim.y + threadIdx.y;
     if (row > nrows) return;
     const int num_blocks_per_row = ncols / QK_K;
     const int ib0 = row*num_blocks_per_row;

+    const block_q4_K * x = (const block_q4_K *)vx + ib0;
+
+#if QK_K == 256
+    const uint16_t kmask1 = 0x3f3f;
+    const uint16_t kmask2 = 0x0f0f;
+    const uint16_t kmask3 = 0xc0c0;
+
     const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...16
     const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0,1

@@ -683,8 +849,6 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float
     uint16_t aux[4];
     const uint8_t * sc = (const uint8_t *)aux;

-    const block_q4_K * x = (const block_q4_K *)vx + ib0;
-
     float tmp = 0; // partial sum for thread in warp

     for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {

@@ -713,6 +877,36 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float
         tmp += dall * (s.x * sc[0] + s.y * sc[1] + s.z * sc[4] + s.w * sc[5]) - dmin * smin;

     }
+#else
+    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15
+    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);
+
+    const int step = tid * K_QUANTS_PER_ITERATION;
+
+    uint16_t aux16[2];
+    const uint8_t * s = (const uint8_t *)aux16;
+
+    float tmp = 0;
+
+    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
+        const uint8_t  * q = x[i].qs + step;
+        const float    * y = yy + i*QK_K + step;
+        const uint16_t * a = (const uint16_t *)x[i].scales;
+        aux16[0] = a[0] & 0x0f0f;
+        aux16[1] = (a[0] >> 4) & 0x0f0f;
+        const float d = (float)x[i].d[0];
+        const float m = (float)x[i].d[1];
+        float sum = 0.f;
+        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
+            sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
+                 + y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2])
+                 + y[j+32] * (d * s[1] * (q[j+ 0] >>  4) - m * s[3])
+                 + y[j+48] * (d * s[1] * (q[j+16] >>  4) - m * s[3]);
+        }
+        tmp += sum;
+    }
+
+#endif

     // sum up partial sums and write back result
     __syncthreads();

@@ -728,15 +922,19 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float

 static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float * yy, float * dst, const int ncols) {

-    const uint16_t kmask1 = 0x3f3f;
-    const uint16_t kmask2 = 0x0f0f;
-    const uint16_t kmask3 = 0xc0c0;
-
-    //const int row = blockIdx.x*blockDim.y + threadIdx.y;
     const int row = blockIdx.x;
     const int num_blocks_per_row = ncols / QK_K;
     const int ib0 = row*num_blocks_per_row;

+    const block_q5_K * x = (const block_q5_K *)vx + ib0;
+
+    float tmp = 0; // partial sum for thread in warp
+
+#if QK_K == 256
+    const uint16_t kmask1 = 0x3f3f;
+    const uint16_t kmask2 = 0x0f0f;
+    const uint16_t kmask3 = 0xc0c0;
+
     const int tid = threadIdx.x/2;  // 0...15
     const int ix  = threadIdx.x%2;

@@ -757,10 +955,6 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float
     uint16_t aux[4];
     const uint8_t * sc = (const uint8_t *)aux;

-    const block_q5_K * x = (const block_q5_K *)vx + ib0;
-
-    float tmp = 0; // partial sum for thread in warp
-
     for (int i = ix; i < num_blocks_per_row; i += 2) {

         const uint8_t * ql1 = x[i].qs + q_offset;

@@ -793,9 +987,32 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float
                  + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
         }
         tmp += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin;

     }

+#else
+    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15
+    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);
+    const int step = tid * K_QUANTS_PER_ITERATION;
+    const int im = step/8;
+    const int in = step%8;
+
+    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
+        const uint8_t * q = x[i].qs + step;
+        const int8_t  * s = x[i].scales;
+        const float   * y = yy + i*QK_K + step;
+        const float     d = x[i].d;
+        float sum = 0.f;
+        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
+            const uint8_t h = x[i].qh[in+j] >> im;
+            sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16))
+                 + y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16))
+                 + y[j+32] * d * s[2] * ((q[j+ 0] >>  4) - ((h >> 4) & 1 ? 0 : 16))
+                 + y[j+48] * d * s[3] * ((q[j+16] >>  4) - ((h >> 6) & 1 ? 0 : 16));
+        }
+        tmp += sum;
+    }
+#endif

     // sum up partial sums and write back result
     __syncthreads();
 #pragma unroll

@@ -803,7 +1020,7 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
     }

-    if (tid == 0) {
+    if (threadIdx.x == 0) {
         dst[row] = tmp;
     }
 }

@@ -820,6 +1037,8 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float

     const block_q6_K * x = (const block_q6_K *)vx + ib0;

+#if QK_K == 256
+
     const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...16
     const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0, 1

@@ -874,6 +1093,37 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float

     }

+#else
+
+    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...7
+    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0...3
+
+    const int step = tid * K_QUANTS_PER_ITERATION;
+
+    float tmp = 0; // partial sum for thread in warp
+
+    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
+
+        const float   * y  = yy + i * QK_K + step;
+        const uint8_t * ql = x[i].ql + step;
+        const uint8_t * qh = x[i].qh + step;
+        const int8_t  * s  = x[i].scales;
+
+        const float d = x[i+0].d;
+
+        float sum = 0;
+        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
+            sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32)
+                 + y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32)
+                 + y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >>  4) | ((qh[j] & 0x30) >> 0)) - 32)
+                 + y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >>  4) | ((qh[j] & 0xc0) >> 2)) - 32);
+        }
+        tmp += sum;
+
+    }
+
+#endif

     // sum up partial sums and write back result
     __syncthreads();
 #pragma unroll
@@ -1252,12 +1502,20 @@ static void dequantize_row_q8_0_cuda(const void * vx, float * y, const int k, cu

 static void dequantize_row_q2_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
     const int nb = k / QK_K;
+#if QK_K == 256
     dequantize_block_q2_K<<<nb, 64, 0, stream>>>(vx, y);
+#else
+    dequantize_block_q2_K<<<nb, 32, 0, stream>>>(vx, y);
+#endif
 }

 static void dequantize_row_q3_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
     const int nb = k / QK_K;
+#if QK_K == 256
     dequantize_block_q3_K<<<nb, 64, 0, stream>>>(vx, y);
+#else
+    dequantize_block_q3_K<<<nb, 32, 0, stream>>>(vx, y);
+#endif
 }

 static void dequantize_row_q4_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {

@@ -1267,12 +1525,20 @@ static void dequantize_row_q4_K_cuda(const void * vx, float * y, const int k, cu

 static void dequantize_row_q5_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
     const int nb = k / QK_K;
+#if QK_K == 256
     dequantize_block_q5_K<<<nb, 64, 0, stream>>>(vx, y);
+#else
+    dequantize_block_q5_K<<<nb, 32, 0, stream>>>(vx, y);
+#endif
 }

 static void dequantize_row_q6_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
     const int nb = k / QK_K;
+#if QK_K == 256
     dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
+#else
+    dequantize_block_q6_K<<<nb, 32, 0, stream>>>(vx, y);
+#endif
 }

 static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
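A quick sketch of the launch-geometry reasoning behind the 64-thread vs 32-thread split above: each CUDA block dequantizes exactly one super-block, so the per-thread workload is QK_K divided by the thread count. The helper below is illustrative only:

// each thread of dequantize_block_* writes QK_K / blockDim.x values
#include <cassert>

constexpr int values_per_thread(int qk_k, int threads) {
    return qk_k / threads;
}

static_assert(values_per_thread(256, 64) == 4, "QK_K == 256: 64 threads, 4 values each");
static_assert(values_per_thread( 64, 32) == 2, "QK_K == 64: 32 threads, 2 values each");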
@@ -2553,6 +2819,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {

     tensor->backend = GGML_BACKEND_GPU;
     struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
+    memset(extra, 0, sizeof(*extra));

     const bool inplace = (tensor->src0 != nullptr && tensor->src0->data == tensor->data) ||
         tensor->op == GGML_OP_VIEW;

@@ -2635,7 +2902,7 @@ void ggml_cuda_free_scratch() {
 bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
     ggml_cuda_func_t func;
     const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
-        || tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT
+        || (tensor->src0 != nullptr && (tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT))
         || (tensor->src1 != nullptr && tensor->src1->backend == GGML_BACKEND_GPU);

     switch (tensor->op) {
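A minimal sketch of the null-guard this last hunk adds: the old expression dereferenced tensor->src0 unconditionally, which crashes for tensors with no first source. The types below are hypothetical stand-ins and the GPU_SPLIT case is folded away for brevity:

// illustrate checking a possibly-null source before dereferencing it
struct tensor_t {
    int backend;           // e.g. GGML_BACKEND_GPU
    const tensor_t * src0; // may be null
    const tensor_t * src1; // may be null
};

constexpr int GPU = 1;

bool any_on_device(const tensor_t * t) {
    return t->backend == GPU
        || (t->src0 != nullptr && t->src0->backend == GPU) // guard before deref
        || (t->src1 != nullptr && t->src1->backend == GPU);
}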
ggml-metal.m (66 changes)
@@ -51,21 +51,21 @@ struct ggml_metal_context {
     GGML_METAL_DECL_KERNEL(get_rows_f16);
     GGML_METAL_DECL_KERNEL(get_rows_q4_0);
     GGML_METAL_DECL_KERNEL(get_rows_q4_1);
-    GGML_METAL_DECL_KERNEL(get_rows_q2_k);
-    GGML_METAL_DECL_KERNEL(get_rows_q3_k);
-    GGML_METAL_DECL_KERNEL(get_rows_q4_k);
-    GGML_METAL_DECL_KERNEL(get_rows_q5_k);
-    GGML_METAL_DECL_KERNEL(get_rows_q6_k);
+    GGML_METAL_DECL_KERNEL(get_rows_q2_K);
+    GGML_METAL_DECL_KERNEL(get_rows_q3_K);
+    GGML_METAL_DECL_KERNEL(get_rows_q4_K);
+    GGML_METAL_DECL_KERNEL(get_rows_q5_K);
+    GGML_METAL_DECL_KERNEL(get_rows_q6_K);
     GGML_METAL_DECL_KERNEL(rms_norm);
     GGML_METAL_DECL_KERNEL(norm);
     GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
     GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
     GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32);
-    GGML_METAL_DECL_KERNEL(mul_mat_q2_k_f32);
-    GGML_METAL_DECL_KERNEL(mul_mat_q3_k_f32);
-    GGML_METAL_DECL_KERNEL(mul_mat_q4_k_f32);
-    GGML_METAL_DECL_KERNEL(mul_mat_q5_k_f32);
-    GGML_METAL_DECL_KERNEL(mul_mat_q6_k_f32);
+    GGML_METAL_DECL_KERNEL(mul_mat_q2_K_f32);
+    GGML_METAL_DECL_KERNEL(mul_mat_q3_K_f32);
+    GGML_METAL_DECL_KERNEL(mul_mat_q4_K_f32);
+    GGML_METAL_DECL_KERNEL(mul_mat_q5_K_f32);
+    GGML_METAL_DECL_KERNEL(mul_mat_q6_K_f32);
     GGML_METAL_DECL_KERNEL(rope);
     GGML_METAL_DECL_KERNEL(alibi_f32);
     GGML_METAL_DECL_KERNEL(cpy_f32_f16);

@@ -132,7 +132,13 @@ struct ggml_metal_context * ggml_metal_init(void) {
             exit(1);
         }

+#ifdef GGML_QKK_64
+        MTLCompileOptions* options = [MTLCompileOptions new];
+        options.preprocessorMacros = @{ @"QK_K" : @(64) };
+        ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error];
+#else
         ctx->library = [ctx->device newLibraryWithSource:src options:nil error:&error];
+#endif
         if (error) {
             fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
             exit(1);

@@ -159,21 +165,21 @@ struct ggml_metal_context * ggml_metal_init(void) {
     GGML_METAL_ADD_KERNEL(get_rows_f16);
     GGML_METAL_ADD_KERNEL(get_rows_q4_0);
     GGML_METAL_ADD_KERNEL(get_rows_q4_1);
-    GGML_METAL_ADD_KERNEL(get_rows_q2_k);
-    GGML_METAL_ADD_KERNEL(get_rows_q3_k);
-    GGML_METAL_ADD_KERNEL(get_rows_q4_k);
-    GGML_METAL_ADD_KERNEL(get_rows_q5_k);
-    GGML_METAL_ADD_KERNEL(get_rows_q6_k);
+    GGML_METAL_ADD_KERNEL(get_rows_q2_K);
+    GGML_METAL_ADD_KERNEL(get_rows_q3_K);
+    GGML_METAL_ADD_KERNEL(get_rows_q4_K);
+    GGML_METAL_ADD_KERNEL(get_rows_q5_K);
+    GGML_METAL_ADD_KERNEL(get_rows_q6_K);
     GGML_METAL_ADD_KERNEL(rms_norm);
     GGML_METAL_ADD_KERNEL(norm);
     GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
     GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
     GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32);
-    GGML_METAL_ADD_KERNEL(mul_mat_q2_k_f32);
-    GGML_METAL_ADD_KERNEL(mul_mat_q3_k_f32);
-    GGML_METAL_ADD_KERNEL(mul_mat_q4_k_f32);
-    GGML_METAL_ADD_KERNEL(mul_mat_q5_k_f32);
-    GGML_METAL_ADD_KERNEL(mul_mat_q6_k_f32);
+    GGML_METAL_ADD_KERNEL(mul_mat_q2_K_f32);
+    GGML_METAL_ADD_KERNEL(mul_mat_q3_K_f32);
+    GGML_METAL_ADD_KERNEL(mul_mat_q4_K_f32);
+    GGML_METAL_ADD_KERNEL(mul_mat_q5_K_f32);
+    GGML_METAL_ADD_KERNEL(mul_mat_q6_K_f32);
     GGML_METAL_ADD_KERNEL(rope);
     GGML_METAL_ADD_KERNEL(alibi_f32);
     GGML_METAL_ADD_KERNEL(cpy_f32_f16);

@@ -662,7 +668,7 @@ void ggml_metal_graph_compute(

     nth0 = 4;
     nth1 = 16;
-    [encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_k_f32];
+    [encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_K_f32];
 } break;
 case GGML_TYPE_Q3_K:
 {

@@ -671,7 +677,7 @@ void ggml_metal_graph_compute(

     nth0 = 4;
     nth1 = 16;
-    [encoder setComputePipelineState:ctx->pipeline_mul_mat_q3_k_f32];
+    [encoder setComputePipelineState:ctx->pipeline_mul_mat_q3_K_f32];
 } break;
 case GGML_TYPE_Q4_K:
 {

@@ -680,7 +686,7 @@ void ggml_metal_graph_compute(

     nth0 = 4;
     nth1 = 16;
-    [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_k_f32];
+    [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32];
 } break;
 case GGML_TYPE_Q5_K:
 {

@@ -689,7 +695,7 @@ void ggml_metal_graph_compute(

     nth0 = 4;
     nth1 = 16;
-    [encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_k_f32];
+    [encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_K_f32];
 } break;
 case GGML_TYPE_Q6_K:
 {

@@ -698,7 +704,7 @@ void ggml_metal_graph_compute(

     nth0 = 4;
     nth1 = 16;
-    [encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_k_f32];
+    [encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_K_f32];
 } break;
 default:
 {

@@ -750,11 +756,11 @@ void ggml_metal_graph_compute(
 case GGML_TYPE_F16:  [encoder setComputePipelineState:ctx->pipeline_get_rows_f16];  break;
 case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break;
 case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_1]; break;
-case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q2_k]; break;
-case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q3_k]; break;
-case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_k]; break;
-case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_k]; break;
-case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_k]; break;
+case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q2_K]; break;
+case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q3_K]; break;
+case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_K]; break;
+case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_K]; break;
+case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_K]; break;
 default: GGML_ASSERT(false && "not implemented");
 }
ggml-metal.metal (414 changes)
@@ -428,7 +428,7 @@ kernel void kernel_mul_mat_q4_0_f32(
     }
     threadgroup_barrier(mem_flags::mem_threadgroup);
     if (ith == 0) {
-        for (uint i = 16; i < nth; i += 16) sum[0] += sum[i];
+        for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
         dst[r1*ne0 + r0] = sum[0];
     }
 }

@@ -497,7 +497,7 @@ kernel void kernel_mul_mat_q4_1_f32(
     }
     threadgroup_barrier(mem_flags::mem_threadgroup);
     if (ith == 0) {
-        for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
+        for (uint i = 16; i < nth; i += 16) sum[0] += sum[i];
         dst[r1*ne0 + r0] = sum[0];
     }
 }

@@ -775,47 +775,76 @@ kernel void kernel_cpy_f32_f32(

 //============================================ k-quants ======================================================

+#ifndef QK_K
 #define QK_K 256
+#else
+static_assert(QK_K == 256 || QK_K == 64, "QK_K must be 256 or 64");
+#endif
+
+#if QK_K == 256
+#define K_SCALE_SIZE 12
+#else
+#define K_SCALE_SIZE 4
+#endif

 typedef struct {
     uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
     uint8_t qs[QK_K/4];      // quants
     half d;                  // super-block scale for quantized scales
     half dmin;               // super-block scale for quantized mins
-} block_q2_k;
+} block_q2_K;
 // 84 bytes / block

 typedef struct {
     uint8_t hmask[QK_K/8]; // quants - high bit
     uint8_t qs[QK_K/4];    // quants - low 2 bits
-    uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
+#if QK_K == 64
+    uint8_t scales[2];
+#else
+    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
+#endif
     half d;                // super-block scale
-} block_q3_k;
-// 110 bytes / block
+} block_q3_K;

+#if QK_K == 64
+typedef struct {
+    half    d[2];       // super-block scales/mins
+    uint8_t scales[2];
+    uint8_t qs[QK_K/2]; // 4-bit quants
+} block_q4_K;
+#else
 typedef struct {
     half d;                       // super-block scale for quantized scales
     half dmin;                    // super-block scale for quantized mins
-    uint8_t scales[3*QK_K/64];    // scales and mins, quantized with 6 bits
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
     uint8_t qs[QK_K/2];           // 4--bit quants
-} block_q4_k;
-// 144 bytes / block
+} block_q4_K;
+#endif

+#if QK_K == 64
+typedef struct {
+    half d;                  // super-block scales/mins
+    int8_t scales[QK_K/16];  // 8-bit block scales
+    uint8_t qh[QK_K/8];      // quants, high bit
+    uint8_t qs[QK_K/2];      // quants, low 4 bits
+} block_q5_K;
+#else
 typedef struct {
     half d;                    // super-block scale for quantized scales
     half dmin;                 // super-block scale for quantized mins
     uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
     uint8_t qh[QK_K/8];        // quants, high bit
     uint8_t qs[QK_K/2];        // quants, low 4 bits
-} block_q5_k;
+} block_q5_K;
 // 176 bytes / block
+#endif

 typedef struct {
     uint8_t ql[QK_K/2];      // quants, lower 4 bits
     uint8_t qh[QK_K/4];      // quants, upper 2 bits
     int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
     half d;                  // super-block scale
-} block_q6_k;
+} block_q6_K;
 // 210 bytes / block

 static inline uchar4 get_scale_min_k4(int j, device const uint8_t * q) {
|
@ -836,7 +865,7 @@ static inline uchar4 get_scale_min_k4(int j, device const uint8_t * q) {
|
||||||
|
|
||||||
//========================================== dequantization =============================
|
//========================================== dequantization =============================
|
||||||
|
|
||||||
static void dequantize_row_q2_k(device const block_q2_k * x, device float * y, int k) {
|
static void dequantize_row_q2_K(device const block_q2_K * x, device float * y, int k) {
|
||||||
assert(k % QK_K == 0);
|
assert(k % QK_K == 0);
|
||||||
const int nb = k / QK_K;
|
const int nb = k / QK_K;
|
||||||
|
|
||||||
|
@ -847,6 +876,7 @@ static void dequantize_row_q2_k(device const block_q2_k * x, device float * y, i
|
||||||
|
|
||||||
device const uint8_t * q = x[i].qs;
|
device const uint8_t * q = x[i].qs;
|
||||||
|
|
||||||
|
#if QK_K == 256
|
||||||
int is = 0;
|
int is = 0;
|
||||||
float dl, ml;
|
float dl, ml;
|
||||||
for (int n = 0; n < QK_K; n += 128) {
|
for (int n = 0; n < QK_K; n += 128) {
|
||||||
|
@ -865,14 +895,29 @@ static void dequantize_row_q2_k(device const block_q2_k * x, device float * y, i
|
||||||
}
|
}
|
||||||
q += 32;
|
q += 32;
|
||||||
}
|
}
|
||||||
|
#else
|
||||||
|
float dl1 = d * (x[i].scales[0] & 0xF), ml1 = min * (x[i].scales[0] >> 4);
|
||||||
|
float dl2 = d * (x[i].scales[1] & 0xF), ml2 = min * (x[i].scales[1] >> 4);
|
||||||
|
float dl3 = d * (x[i].scales[2] & 0xF), ml3 = min * (x[i].scales[2] >> 4);
|
||||||
|
float dl4 = d * (x[i].scales[3] & 0xF), ml4 = min * (x[i].scales[3] >> 4);
|
||||||
|
for (int l = 0; l < 16; ++l) {
|
||||||
|
y[l+ 0] = dl1 * ((q[l] >> 0) & 3) - ml1;
|
||||||
|
y[l+16] = dl2 * ((q[l] >> 2) & 3) - ml2;
|
||||||
|
y[l+32] = dl3 * ((q[l] >> 4) & 3) - ml3;
|
||||||
|
y[l+48] = dl4 * ((q[l] >> 6) & 3) - ml4;
|
||||||
|
}
|
||||||
|
y += QK_K;
|
||||||
|
#endif
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void dequantize_row_q3_k(device const block_q3_k * x, device float * y, int k) {
|
static void dequantize_row_q3_K(device const block_q3_K * x, device float * y, int k) {
|
||||||
assert(k % QK_K == 0);
|
assert(k % QK_K == 0);
|
||||||
const int nb = k / QK_K;
|
const int nb = k / QK_K;
|
||||||
|
|
||||||
|
#if QK_K == 256
|
||||||
|
|
||||||
const uint16_t kmask1 = 0x0303;
|
const uint16_t kmask1 = 0x0303;
|
||||||
const uint16_t kmask2 = 0x0f0f;
|
const uint16_t kmask2 = 0x0f0f;
|
||||||
|
|
||||||
|
@ -918,22 +963,49 @@ static void dequantize_row_q3_k(device const block_q3_k * x, device float * y, i
|
||||||
}
|
}
|
||||||
q += 32;
|
q += 32;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
for (int i = 0; i < nb; i++) {
|
||||||
|
|
||||||
|
const float d_all = (float)(x[i].d);
|
||||||
|
|
||||||
|
device const uint8_t * q = x[i].qs;
|
||||||
|
device const uint8_t * hm = x[i].hmask;
|
||||||
|
|
||||||
|
const float d1 = d_all * ((x[i].scales[0] & 0xF) - 8);
|
||||||
|
const float d2 = d_all * ((x[i].scales[0] >> 4) - 8);
|
||||||
|
const float d3 = d_all * ((x[i].scales[1] & 0xF) - 8);
|
||||||
|
const float d4 = d_all * ((x[i].scales[1] >> 4) - 8);
|
||||||
|
|
||||||
|
for (int l = 0; l < 8; ++l) {
|
||||||
|
uint8_t h = hm[l];
|
||||||
|
y[l+ 0] = d1 * ((int8_t)((q[l+0] >> 0) & 3) - ((h & 0x01) ? 0 : 4));
|
||||||
|
y[l+ 8] = d1 * ((int8_t)((q[l+8] >> 0) & 3) - ((h & 0x02) ? 0 : 4));
|
||||||
|
y[l+16] = d2 * ((int8_t)((q[l+0] >> 2) & 3) - ((h & 0x04) ? 0 : 4));
|
||||||
|
y[l+24] = d2 * ((int8_t)((q[l+8] >> 2) & 3) - ((h & 0x08) ? 0 : 4));
|
||||||
|
y[l+32] = d3 * ((int8_t)((q[l+0] >> 4) & 3) - ((h & 0x10) ? 0 : 4));
|
||||||
|
y[l+40] = d3 * ((int8_t)((q[l+8] >> 4) & 3) - ((h & 0x20) ? 0 : 4));
|
||||||
|
y[l+48] = d4 * ((int8_t)((q[l+0] >> 6) & 3) - ((h & 0x40) ? 0 : 4));
|
||||||
|
y[l+56] = d4 * ((int8_t)((q[l+8] >> 6) & 3) - ((h & 0x80) ? 0 : 4));
|
||||||
|
}
|
||||||
|
y += QK_K;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
static void dequantize_row_q4_K(device const block_q4_K * x, device float * y, int k) {
|
||||||
|
|
||||||
static void dequantize_row_q4_k(device const block_q4_k * x, device float * y, int k) {
|
|
||||||
assert(k % QK_K == 0);
|
assert(k % QK_K == 0);
|
||||||
const int nb = k / QK_K;
|
const int nb = k / QK_K;
|
||||||
|
|
||||||
|
|
||||||
for (int i = 0; i < nb; i++) {
|
for (int i = 0; i < nb; i++) {
|
||||||
|
|
||||||
|
device const uint8_t * q = x[i].qs;
|
||||||
|
|
||||||
|
#if QK_K == 256
|
||||||
const float d = x[i].d;
|
const float d = x[i].d;
|
||||||
const float min = x[i].dmin;
|
const float min = x[i].dmin;
|
||||||
|
|
||||||
device const uint8_t * q = x[i].qs;
|
|
||||||
device const uint8_t * scales = x[i].scales;
|
device const uint8_t * scales = x[i].scales;
|
||||||
|
|
||||||
int is = 0;
|
int is = 0;
|
||||||
@@ -945,14 +1017,29 @@ static void dequantize_row_q4_k(device const block_q4_k * x, device float * y, i
             for (int l = 0; l < 32; ++l) *y++ = d2 * (q[l] >> 4) - m2;
             q += 32; is += 2;
         }
+#else
+        device const uint8_t * s = x[i].scales;
+        device const half2 * dh = (device const half2 *)x[i].d;
+        const float2 d = (float2)dh[0];
+        const float d1 = d[0] * (s[0] & 0xF);
+        const float d2 = d[0] * (s[1] & 0xF);
+        const float m1 = d[1] * (s[0] >> 4);
+        const float m2 = d[1] * (s[1] >> 4);
+        for (int l = 0; l < 32; ++l) {
+            y[l+ 0] = d1 * (q[l] & 0xF) - m1;
+            y[l+32] = d2 * (q[l] >> 4) - m2;
+        }
+        y += QK_K;
+#endif
 
     }
 }
 
-static void dequantize_row_q5_k(device const block_q5_k * x, device float * y, int k) {
+static void dequantize_row_q5_K(device const block_q5_K * x, device float * y, int k) {
     assert(k % QK_K == 0);
     const int nb = k / QK_K;
 
+#if QK_K == 256
     for (int i = 0; i < nb; i++) {
 
         const float d = (float)(x[i].d);
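
The 64-weight q4 block dequantized in the new #else branch keeps two fp16 values, one scaling the 4-bit scale nibbles and one scaling the 4-bit min nibbles. A host-side C sketch under assumed field types (plain floats instead of fp16):

#include <stdint.h>

// hypothetical 64-weight q4 block: d[0] scales the scale nibbles,
// d[1] scales the min nibbles (both fp16 in ggml, float here)
typedef struct {
    float   d[2];
    uint8_t scales[2];  // low nibbles: scales, high nibbles: mins
    uint8_t qs[32];     // 4-bit quants, two weights per byte
} block_q4_64;

static void dequant_q4_64(const block_q4_64 * x, float * y) {
    const float d1 = x->d[0] * (x->scales[0] & 0xF);
    const float d2 = x->d[0] * (x->scales[1] & 0xF);
    const float m1 = x->d[1] * (x->scales[0] >> 4);
    const float m2 = x->d[1] * (x->scales[1] >> 4);
    for (int l = 0; l < 32; ++l) {
        y[l+ 0] = d1 * (x->qs[l] & 0xF) - m1;  // low nibble -> first half
        y[l+32] = d2 * (x->qs[l] >> 4) - m2;   // high nibble -> second half
    }
}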
@@ -973,10 +1060,32 @@ static void dequantize_row_q5_k(device const block_q5_k * x, device float * y, i
             u1 <<= 2; u2 <<= 2;
         }
     }
+#else
+    for (int i = 0; i < nb; i++) {
+
+        const float d = (float)x[i].d;
+
+        device const uint8_t * ql = x[i].qs;
+        device const uint8_t * qh = x[i].qh;
+        device const int8_t  * sc = x[i].scales;
+
+        for (int l = 0; l < 8; ++l) {
+            y[l+ 0] = d * sc[0] * ((ql[l+ 0] & 0xF) - (qh[l] & 0x01 ? 0 : 16));
+            y[l+ 8] = d * sc[0] * ((ql[l+ 8] & 0xF) - (qh[l] & 0x02 ? 0 : 16));
+            y[l+16] = d * sc[1] * ((ql[l+16] & 0xF) - (qh[l] & 0x04 ? 0 : 16));
+            y[l+24] = d * sc[1] * ((ql[l+24] & 0xF) - (qh[l] & 0x08 ? 0 : 16));
+            y[l+32] = d * sc[2] * ((ql[l+ 0] >> 4) - (qh[l] & 0x10 ? 0 : 16));
+            y[l+40] = d * sc[2] * ((ql[l+ 8] >> 4) - (qh[l] & 0x20 ? 0 : 16));
+            y[l+48] = d * sc[3] * ((ql[l+16] >> 4) - (qh[l] & 0x40 ? 0 : 16));
+            y[l+56] = d * sc[3] * ((ql[l+24] >> 4) - (qh[l] & 0x80 ? 0 : 16));
+        }
+        y += QK_K;
+    }
+#endif
 
 }
 
-static void dequantize_row_q6_k(device const block_q6_k * x, device float * y, int k) {
+static void dequantize_row_q6_K(device const block_q6_K * x, device float * y, int k) {
     assert(k % QK_K == 0);
     const int nb = k / QK_K;
 
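
In the q5 small-block path above, each weight's fifth bit decides whether 16 is subtracted before scaling. A scalar C helper making that rule explicit (names are illustrative, not ggml API):

#include <stdint.h>

// one q5 weight = 4 low bits from ql plus 1 high bit from qh; a clear
// high bit means the value is shifted down by 16, as in the branch above
static inline float q5_weight(float d, int8_t sc, uint8_t ql_nibble,
                              uint8_t qh_byte, uint8_t bit) {
    return d * (float)sc * ((float)ql_nibble - ((qh_byte & bit) ? 0.0f : 16.0f));
}
// e.g. y[0] = q5_weight(d, sc[0], ql[0] & 0xF, qh[0], 0x01);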
@@ -988,6 +1097,7 @@ static void dequantize_row_q6_k(device const block_q6_k * x, device float * y, i
 
         const float d = x[i].d;
 
+#if QK_K == 256
         for (int n = 0; n < QK_K; n += 128) {
             for (int l = 0; l < 32; ++l) {
                 int is = l/16;
@@ -1005,10 +1115,23 @@ static void dequantize_row_q6_k(device const block_q6_k * x, device float * y, i
             qh += 32;
             sc += 8;
         }
+#else
+        for (int l = 0; l < 16; ++l) {
+            const int8_t q1 = (int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
+            const int8_t q2 = (int8_t)((ql[l+16] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
+            const int8_t q3 = (int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
+            const int8_t q4 = (int8_t)((ql[l+16] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
+            y[l+ 0] = d * sc[0] * q1;
+            y[l+16] = d * sc[1] * q2;
+            y[l+32] = d * sc[2] * q3;
+            y[l+48] = d * sc[3] * q4;
+        }
+        y += 64;
+#endif
     }
 }
 
-kernel void kernel_get_rows_q2_k(
+kernel void kernel_get_rows_q2_K(
     device const void * src0,
     device const int * src1,
     device float * dst,
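
The q6 small-block path above rebuilds each 6-bit weight from a 4-bit low nibble and a 2-bit high pair, then recenters by 32. The composition as a standalone C helper (illustrative, not ggml API):

#include <stdint.h>

// a q6 weight is 4 low bits (ql) plus 2 high bits (qh), minus 32
static inline int8_t q6_value(uint8_t ql_nibble, uint8_t qh_bits) {
    return (int8_t)((ql_nibble & 0xF) | ((qh_bits & 3) << 4)) - 32;
}
// e.g. y[l] = d * sc[0] * q6_value(ql[l], qh[l] >> 0);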
@@ -1019,12 +1142,12 @@ kernel void kernel_get_rows_q2_k(
     const int i = tpig;
     const int r = ((device int32_t *) src1)[i];
 
-    dequantize_row_q2_k(
-            (device const block_q2_k *) ((device char *) src0 + r*nb01),
+    dequantize_row_q2_K(
+            (device const block_q2_K *) ((device char *) src0 + r*nb01),
             (device float *) ((device char *) dst + i*nb1), ne00);
 }
 
-kernel void kernel_get_rows_q3_k(
+kernel void kernel_get_rows_q3_K(
     device const void * src0,
     device const int * src1,
     device float * dst,
@@ -1035,12 +1158,12 @@ kernel void kernel_get_rows_q3_k(
     const int i = tpig;
     const int r = ((device int32_t *) src1)[i];
 
-    dequantize_row_q3_k(
-            (device const block_q3_k *) ((device char *) src0 + r*nb01),
+    dequantize_row_q3_K(
+            (device const block_q3_K *) ((device char *) src0 + r*nb01),
             (device float *) ((device char *) dst + i*nb1), ne00);
 }
 
-kernel void kernel_get_rows_q4_k(
+kernel void kernel_get_rows_q4_K(
     device const void * src0,
     device const int * src1,
     device float * dst,
@@ -1051,12 +1174,12 @@ kernel void kernel_get_rows_q4_k(
     const int i = tpig;
     const int r = ((device int32_t *) src1)[i];
 
-    dequantize_row_q4_k(
-            (device const block_q4_k *) ((device char *) src0 + r*nb01),
+    dequantize_row_q4_K(
+            (device const block_q4_K *) ((device char *) src0 + r*nb01),
             (device float *) ((device char *) dst + i*nb1), ne00);
 }
 
-kernel void kernel_get_rows_q5_k(
+kernel void kernel_get_rows_q5_K(
     device const void * src0,
     device const int * src1,
     device float * dst,
@@ -1067,12 +1190,12 @@ kernel void kernel_get_rows_q5_k(
     const int i = tpig;
     const int r = ((device int32_t *) src1)[i];
 
-    dequantize_row_q5_k(
-            (device const block_q5_k *) ((device char *) src0 + r*nb01),
+    dequantize_row_q5_K(
+            (device const block_q5_K *) ((device char *) src0 + r*nb01),
             (device float *) ((device char *) dst + i*nb1), ne00);
 }
 
-kernel void kernel_get_rows_q6_k(
+kernel void kernel_get_rows_q6_K(
     device const void * src0,
     device const int * src1,
     device float * dst,
@@ -1083,14 +1206,14 @@ kernel void kernel_get_rows_q6_k(
     const int i = tpig;
     const int r = ((device int32_t *) src1)[i];
 
-    dequantize_row_q6_k(
-            (device const block_q6_k *) ((device char *) src0 + r*nb01),
+    dequantize_row_q6_K(
+            (device const block_q6_K *) ((device char *) src0 + r*nb01),
            (device float *) ((device char *) dst + i*nb1), ne00);
 }
 
 //====================================== dot products =========================
 
-kernel void kernel_mul_mat_q2_k_f32(
+kernel void kernel_mul_mat_q2_K_f32(
     device const void * src0,
     device const float * src1,
     device float * dst,
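
All the kernel_get_rows_* variants above share one gather pattern: thread i reads a row index from src1 and dequantizes that row of src0 into row i of dst. A minimal host-side C model of that control flow (function and parameter names here are illustrative, not ggml API):

#include <stddef.h>
#include <stdint.h>

typedef void (*dequant_row_t)(const void * src, float * dst, int k);

// rows are nb01 bytes apart in src0 and nb1 bytes apart in dst
static void get_rows(const void * src0, const int32_t * src1, float * dst,
                     int nrows, int ne00, size_t nb01, size_t nb1,
                     dequant_row_t dequant_row) {
    for (int i = 0; i < nrows; ++i) {
        const int r = src1[i];
        dequant_row((const char *) src0 + r*nb01,
                    (float *) ((char *) dst + i*nb1), ne00);
    }
}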
@@ -1107,12 +1230,15 @@ kernel void kernel_mul_mat_q2_k_f32(
     const int64_t r0 = tgpig.x;
     const int64_t r1 = tgpig.y;
 
-    device const block_q2_k * x = (device const block_q2_k *) src0 + r0*nb;
+    device const block_q2_K * x = (device const block_q2_K *) src0 + r0*nb;
     device const float * yy = (device const float *) src1 + r1*ne10;
 
     const int nth = tptg.x*tptg.y;
     const int ith = tptg.y*tpitg.x + tpitg.y;
 
+    float sumf = 0;
+
+#if QK_K == 256
     const int tid = tpitg.y;    // 0...16
     const int il = tid/4;       // 0...3
     const int ir = tid%4;       // 0...3
@@ -1125,9 +1251,6 @@ kernel void kernel_mul_mat_q2_k_f32(
     const int y_offset = 64*il + n*ir;
     const int q_offset = 32*ip + n*ir;
 
-    sum[ith] = 0.0f;
-
-    float sumf = 0;
     for (int i = tpitg.x; i < nb; i += tptg.x) {
 
         device const uint8_t * q = x[i].qs + q_offset;
@@ -1140,7 +1263,6 @@ kernel void kernel_mul_mat_q2_k_f32(
 
         device const float * y = yy + i*QK_K + y_offset;
 
-        //float4 s = {0.f, 0.f, 0.f, 0.f};
         float2 s = {0.f, 0.f};
         float smin = 0;
         for (int l = 0; l < n; ++l) {
@@ -1155,25 +1277,38 @@ kernel void kernel_mul_mat_q2_k_f32(
         sumf += dall * (s[0] * d1 + s[1] * d2) - dmin * smin;
 
     }
+#else
+    const int il = 4 * tpitg.x;
+
+    uint32_t aux[2];
+    thread const uint8_t * d = (thread const uint8_t *)aux;
+    thread const uint8_t * m = (thread const uint8_t *)aux + 4;
+
+    for (int i = tpitg.y; i < nb; i += tptg.y) {
+
+        device const uint8_t * q = x[i].qs + il;
+        device const float * y = yy + i*QK_K + il;
+
+        const float dall = (float)x[i].d;
+        const float dmin = (float)x[i].dmin;
+
+        device const uint32_t * a = (device const uint32_t *)x[i].scales;
+        aux[0] = a[0] & 0x0f0f0f0f;
+        aux[1] = (a[0] >> 4) & 0x0f0f0f0f;
+
+        for (int l = 0; l < 4; ++l) {
+            sumf += y[l+ 0] * (dall * d[0] * ((q[l] >> 0) & 3) - dmin * m[0])
+                  + y[l+16] * (dall * d[1] * ((q[l] >> 2) & 3) - dmin * m[1])
+                  + y[l+32] * (dall * d[2] * ((q[l] >> 4) & 3) - dmin * m[2])
+                  + y[l+48] * (dall * d[3] * ((q[l] >> 6) & 3) - dmin * m[3]);
+        }
+    }
+#endif
 
     sum[ith] = sumf;
 
-    //int mask1 = (ith%4 == 0);
-    //int mask2 = (ith%16 == 0);
-
-    //threadgroup_barrier(mem_flags::mem_threadgroup);
-    //for (int i = 1; i < 4; ++i) sum[ith] += mask1 * sum[ith + i];
-    //threadgroup_barrier(mem_flags::mem_threadgroup);
-    //for (int i = 4; i < 16; i += 4) sum[ith] += mask2 * sum[ith + i];
-    //threadgroup_barrier(mem_flags::mem_threadgroup);
-    //if (ith == 0) {
-    //    for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
-    //    dst[r1*ne0 + r0] = sum[0];
-    //}
-
     //
     // Accumulate the sum from all threads in the threadgroup
-    // This version is slightly faster than the commented out one below,
-    // which I copy-pasted from ggerganov's q4_0 dot product for metal.
     //
     threadgroup_barrier(mem_flags::mem_threadgroup);
     if (ith%4 == 0) {
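
The QK_K == 64 branch above unpacks the four 4-bit scales and four 4-bit mins with two masked operations on a single uint32_t. The same trick in portable C, a sketch that uses memcpy instead of the kernel's pointer casts:

#include <stdint.h>
#include <string.h>

// splits four packed scale/min bytes into d[0..3] (low nibbles)
// and m[0..3] (high nibbles) with two masked operations
static void unpack_scales_q2(const uint8_t scales[4],
                             uint8_t d[4], uint8_t m[4]) {
    uint32_t a;
    memcpy(&a, scales, 4);            // avoids aliasing/alignment issues
    uint32_t lo =  a       & 0x0f0f0f0f;
    uint32_t hi = (a >> 4) & 0x0f0f0f0f;
    memcpy(d, &lo, 4);
    memcpy(m, &hi, 4);
}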
@@ -1190,7 +1325,7 @@ kernel void kernel_mul_mat_q2_k_f32(
     }
 }
 
-kernel void kernel_mul_mat_q3_k_f32(
+kernel void kernel_mul_mat_q3_K_f32(
     device const void * src0,
     device const float * src1,
     device float * dst,
@@ -1203,23 +1338,25 @@ kernel void kernel_mul_mat_q3_k_f32(
     uint2 tpitg[[thread_position_in_threadgroup]],
     uint2 tptg[[threads_per_threadgroup]]) {
 
-    const uint16_t kmask1 = 0x0303;
-    const uint16_t kmask2 = 0x0f0f;
-
-    const uint8_t m3 = 3;
-    const int8_t  m4 = 4;
-
     const int nb = ne00/QK_K;
 
     const int64_t r0 = tgpig.x;
     const int64_t r1 = tgpig.y;
 
-    device const block_q3_k * x = (device const block_q3_k *) src0 + r0*nb;
+    device const block_q3_K * x = (device const block_q3_K *) src0 + r0*nb;
     device const float * yy = (device const float *) src1 + r1*ne10;
 
     const int nth = tptg.x*tptg.y;
     const int ith = tptg.y*tpitg.x + tpitg.y;
 
+#if QK_K == 256
+
+    const uint8_t m3 = 3;
+    const int8_t  m4 = 4;
+
+    const uint16_t kmask1 = 0x0303;
+    const uint16_t kmask2 = 0x0f0f;
+
     const int tid = tpitg.y;        // expecting 16
     const int ip  = tid/8;          // 0 or 1
     const int il  = tid/2 - 4*ip;   // 0...3
@@ -1273,6 +1410,39 @@ kernel void kernel_mul_mat_q3_k_f32(
 
     //sum[ith] = sumf;
     sum[ith] = sumf1 - 32.f*sumf2;
+#else
+    const int il = 4 * tpitg.x;  // 0, 4, 8, 12
+    const int im = il/8;         // 0, 0, 1, 1
+    const int in = il%8;         // 0, 4, 0, 4
+
+    float sumf = 0;
+
+    for (int i = tpitg.y; i < nb; i += tptg.y) {
+
+        const float d_all = (float)(x[i].d);
+
+        device const uint8_t * q = x[i].qs + il;
+        device const uint8_t * h = x[i].hmask + in;
+        device const float   * y = yy + i * QK_K + il;
+
+        const float d1 = d_all * ((x[i].scales[0] & 0xF) - 8);
+        const float d2 = d_all * ((x[i].scales[0] >> 4) - 8);
+        const float d3 = d_all * ((x[i].scales[1] & 0xF) - 8);
+        const float d4 = d_all * ((x[i].scales[1] >> 4) - 8);
+
+        for (int l = 0; l < 4; ++l) {
+            const uint8_t hm = h[l] >> im;
+            sumf += y[l+ 0] * d1 * ((int8_t)((q[l+0] >> 0) & 3) - ((hm & 0x01) ? 0 : 4))
+                  + y[l+16] * d2 * ((int8_t)((q[l+0] >> 2) & 3) - ((hm & 0x04) ? 0 : 4))
+                  + y[l+32] * d3 * ((int8_t)((q[l+0] >> 4) & 3) - ((hm & 0x10) ? 0 : 4))
+                  + y[l+48] * d4 * ((int8_t)((q[l+0] >> 6) & 3) - ((hm & 0x40) ? 0 : 4));
+        }
+
+    }
+
+    sum[ith] = sumf;
+
+#endif
 
     //
     // Accumulate the sum from all threads in the threadgroup
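
The sum[ith] store followed by the barrier implements a staged partial-sum reduction over the threadgroup: groups of 4 fold first, then groups of 16, then slot 0 gathers the rest. A serial C model of that reduction pattern (assuming nth is a multiple of 16, as the kernels do):

// models the reduction the kernels perform between barriers
static float reduce_partials(float * sum, int nth) {
    for (int ith = 0; ith < nth; ith += 4)
        for (int i = 1; i < 4; ++i) sum[ith] += sum[ith + i];
    for (int ith = 0; ith < nth; ith += 16)
        for (int i = 4; i < 16; i += 4) sum[ith] += sum[ith + i];
    for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
    return sum[0];
}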
@@ -1293,7 +1463,7 @@ kernel void kernel_mul_mat_q3_k_f32(
 
     }
 }
 
-kernel void kernel_mul_mat_q4_k_f32(
+kernel void kernel_mul_mat_q4_K_f32(
     device const void * src0,
     device const float * src1,
     device float * dst,
@@ -1305,21 +1475,25 @@ kernel void kernel_mul_mat_q4_k_f32(
     uint2 tpitg[[thread_position_in_threadgroup]],
     uint2 tptg[[threads_per_threadgroup]]) {
 
-    const uint16_t kmask1 = 0x3f3f;
-    const uint16_t kmask2 = 0x0f0f;
-    const uint16_t kmask3 = 0xc0c0;
-
     const int nb = ne00/QK_K;
 
     const int64_t r0 = tgpig.x;
     const int64_t r1 = tgpig.y;
 
-    device const block_q4_k * x = (device const block_q4_k *) src0 + r0*nb;
-    device const float * yy = (device const float *) src1 + r1*ne10;
-
     const int nth = tptg.x*tptg.y;
     const int ith = tptg.y*tpitg.x + tpitg.y;
 
+    device const block_q4_K * x = (device const block_q4_K *) src0 + r0*nb;
+    device const float * yy = (device const float *) src1 + r1*ne10;
+
+    float sumf = 0;
+
+#if QK_K == 256
+
+    const uint16_t kmask1 = 0x3f3f;
+    const uint16_t kmask2 = 0x0f0f;
+    const uint16_t kmask3 = 0xc0c0;
+
     const int tid = tpitg.y;   // 0...16
     const int il  = tid/4;     // 0...3
     const int ir  = tid - 4*il;// 0...3
@@ -1332,11 +1506,8 @@ kernel void kernel_mul_mat_q4_k_f32(
     const int q_offset = 32*im + l0;
     const int y_offset = 64*im + l0;
 
-    sum[ith] = 0.0f;
-
     uchar2 sc1, sc2, sc3, sc4;
 
-    float sumf = 0;
     for (int i = tpitg.x; i < nb; i += tptg.x) {
 
         device const uint8_t * q1 = (x + i)->qs + q_offset;
@@ -1365,6 +1536,30 @@ kernel void kernel_mul_mat_q4_k_f32(
         sumf += dall * (s[0] * sc1[0] + s[1] * sc1[1] + s[2] * sc3[0] + s[3] * sc3[1]) - dmin * smin;
 
     }
+#else
+    uint16_t aux16[2];
+    thread const uint8_t * scales = (thread const uint8_t *)aux16;
+
+    const int il = 4*tpitg.x;
+
+    for (int i = tpitg.y; i < nb; i += tptg.y) {
+
+        device const uint8_t * q = x[i].qs + il;
+        device const float   * y = yy + i * QK_K + il;
+
+        const float d = (float)x[i].d[0];
+        const float m = (float)x[i].d[1];
+
+        device const uint16_t * a = (device const uint16_t *)x[i].scales;
+        aux16[0] = a[0] & 0x0f0f;
+        aux16[1] = (a[0] >> 4) & 0x0f0f;
+
+        for (int l = 0; l < 4; ++l) {
+            sumf += d * scales[0] * (y[l+ 0] * (q[l] & 0xF) + y[l+16] * (q[l+16] & 0xF)) - m * scales[2] * (y[l+ 0] + y[l+16])
+                  + d * scales[1] * (y[l+32] * (q[l] >> 4) + y[l+48] * (q[l+16] >> 4)) - m * scales[3] * (y[l+32] + y[l+48]);
+        }
+    }
+#endif
 
     sum[ith] = sumf;
 
@@ -1401,7 +1596,7 @@ kernel void kernel_mul_mat_q4_k_f32(
     //}
 }
 
-kernel void kernel_mul_mat_q5_k_f32(
+kernel void kernel_mul_mat_q5_K_f32(
     device const void * src0,
     device const float * src1,
     device float * dst,
@@ -1413,21 +1608,25 @@ kernel void kernel_mul_mat_q5_k_f32(
     uint2 tpitg[[thread_position_in_threadgroup]],
     uint2 tptg[[threads_per_threadgroup]]) {
 
-    const uint16_t kmask1 = 0x3f3f;
-    const uint16_t kmask2 = 0x0f0f;
-    const uint16_t kmask3 = 0xc0c0;
-
     const int nb = ne00/QK_K;
 
     const int64_t r0 = tgpig.x;
     const int64_t r1 = tgpig.y;
 
-    device const block_q5_k * x = (device const block_q5_k *) src0 + r0*nb;
+    device const block_q5_K * x = (device const block_q5_K *) src0 + r0*nb;
     device const float * yy = (device const float *) src1 + r1*ne10;
 
     const int nth = tptg.x*tptg.y;
     const int ith = tptg.y*tpitg.x + tpitg.y;
 
+    float sumf = 0;
+
+#if QK_K == 256
+
+    const uint16_t kmask1 = 0x3f3f;
+    const uint16_t kmask2 = 0x0f0f;
+    const uint16_t kmask3 = 0xc0c0;
+
     const int tid = tpitg.y;   // 0...16
     const int il  = tid/4;     // 0...3
     const int ir  = tid - 4*il;// 0...3
@@ -1447,7 +1646,6 @@ kernel void kernel_mul_mat_q5_k_f32(
 
     uchar2 sc1, sc2, sc3, sc4;
 
-    float sumf = 0;
     for (int i = tpitg.x; i < nb; i += tptg.x) {
 
         device const uint8_t * q1 = (x + i)->qs + q_offset;
@@ -1479,6 +1677,28 @@ kernel void kernel_mul_mat_q5_k_f32(
         sumf += dall * (s[0] * sc1[0] + s[1] * sc1[1] + s[2] * sc3[0] + s[3] * sc3[1]) - dmin * smin;
 
     }
+#else
+    const int il = 4 * tpitg.x;  // 0, 4, 8, 12
+    const int im = il/8;         // 0, 0, 1, 1
+    const int in = il%8;         // 0, 4, 0, 4
+
+    for (int i = tpitg.y; i < nb; i += tptg.y) {
+
+        const float d = (float)x[i].d;
+        device const uint8_t * q = x[i].qs + il;
+        device const uint8_t * h = x[i].qh + in;
+        device const int8_t  * s = x[i].scales;
+        device const float   * y = yy + i*QK_K + il;
+
+        for (int l = 0; l < 4; ++l) {
+            const uint8_t hl = h[l] >> im;
+            sumf += y[l+ 0] * d * s[0] * ((q[l+ 0] & 0xF) - (hl & 0x01 ? 0 : 16))
+                  + y[l+16] * d * s[1] * ((q[l+16] & 0xF) - (hl & 0x04 ? 0 : 16))
+                  + y[l+32] * d * s[2] * ((q[l+ 0] >> 4) - (hl & 0x10 ? 0 : 16))
+                  + y[l+48] * d * s[3] * ((q[l+16] >> 4) - (hl & 0x40 ? 0 : 16));
+        }
+    }
+#endif
     sum[ith] = sumf;
 
     //
@@ -1500,7 +1720,7 @@ kernel void kernel_mul_mat_q5_k_f32(
 
 }
 
-kernel void kernel_mul_mat_q6_k_f32(
+kernel void kernel_mul_mat_q6_K_f32(
     device const void * src0,
     device const float * src1,
     device float * dst,
@@ -1522,12 +1742,15 @@ kernel void kernel_mul_mat_q6_k_f32(
     const int64_t r0 = tgpig.x;
     const int64_t r1 = tgpig.y;
 
-    device const block_q6_k * x = (device const block_q6_k *) src0 + r0*nb;
+    device const block_q6_K * x = (device const block_q6_K *) src0 + r0*nb;
     device const float * yy = (device const float *) src1 + r1*ne10;
 
     const int nth = tptg.x*tptg.y;
     const int ith = tptg.y*tpitg.x + tpitg.y;
 
+    float sumf = 0;
+
+#if QK_K == 256
     // Note: we absolutely assume that tptg.y = 16 and QK_K = 256!
     const int iqs = 16 * tpitg.y;
     const int ip = iqs / 128;         // 0 or 1
@@ -1540,7 +1763,6 @@ kernel void kernel_mul_mat_q6_k_f32(
     const int q_offset_l = 64*ip + l0;
     const int q_offset_h = 32*ip + l0;
 
-    float sumf = 0;
     for (int i = tpitg.x; i < nb; i += tptg.x) {
 
         device const uint8_t * ql = x[i].ql + q_offset_l;
@@ -1562,6 +1784,28 @@ kernel void kernel_mul_mat_q6_k_f32(
         sumf += dall * (sums[0] * sc[0] + sums[1] * sc[2] + sums[2] * sc[4] + sums[3] * sc[6]);
 
     }
+#else
+    const int il = 4*tpitg.x;    // 0, 4, 8, 12
+
+    for (int i = tpitg.y; i < nb; i += tptg.y) {
+        device const float * y = yy + i * QK_K + il;
+        device const uint8_t * ql = x[i].ql + il;
+        device const uint8_t * qh = x[i].qh + il;
+        device const int8_t  * s  = x[i].scales;
+
+        const float d = x[i].d;
+
+        float4 sums = {0.f, 0.f, 0.f, 0.f};
+        for (int l = 0; l < 4; ++l) {
+            sums[0] += y[l+ 0] * ((int8_t)((ql[l+ 0] & 0xF) | ((qh[l] & kmask1) << 4)) - 32);
+            sums[1] += y[l+16] * ((int8_t)((ql[l+16] & 0xF) | ((qh[l] & kmask2) << 2)) - 32);
+            sums[2] += y[l+32] * ((int8_t)((ql[l+ 0] >> 4) | ((qh[l] & kmask3) >> 0)) - 32);
+            sums[3] += y[l+48] * ((int8_t)((ql[l+16] >> 4) | ((qh[l] & kmask4) >> 2)) - 32);
+        }
+        sumf += d * (sums[0] * s[0] + sums[1] * s[1] + sums[2] * s[2] + sums[3] * s[3]);
+    }
+
+#endif
 
     sum[ith] = sumf;
 
ggml.c (483 changed lines)
@@ -1,5 +1,5 @@
-// Defines CLOCK_MONOTONIC on Linux
-#define _GNU_SOURCE
+#define _GNU_SOURCE // Defines CLOCK_MONOTONIC on Linux
+#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows
 
 #include "ggml.h"
 
@@ -24,6 +24,7 @@
 #include <stdio.h>
 #include <float.h>
 #include <limits.h>
+#include <stdarg.h>
 
 #ifdef GGML_USE_METAL
 #include <unistd.h>
@@ -159,6 +160,34 @@ typedef void* thread_ret_t;
 #define GGML_MEM_ALIGN 16
 #endif
 
+//
+// logging
+//
+
+#if (GGML_DEBUG >= 1)
+#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
+#else
+#define GGML_PRINT_DEBUG(...)
+#endif
+
+#if (GGML_DEBUG >= 5)
+#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
+#else
+#define GGML_PRINT_DEBUG_5(...)
+#endif
+
+#if (GGML_DEBUG >= 10)
+#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
+#else
+#define GGML_PRINT_DEBUG_10(...)
+#endif
+
+#define GGML_PRINT(...) printf(__VA_ARGS__)
+
+//
+// end of logging block
+//
+
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
 #define GGML_ALIGNED_FREE(ptr)    _aligned_free(ptr)
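
A standalone sketch of how the verbosity levels above behave. GGML_DEBUG is a compile-time setting (ggml.c itself defaults it to 0; the value here is assumed for illustration):

#include <stdio.h>

#define GGML_DEBUG 5   // assumed build-time verbosity

#if (GGML_DEBUG >= 1)
#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG(...)
#endif

#if (GGML_DEBUG >= 10)
#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG_10(...)
#endif

int main(void) {
    GGML_PRINT_DEBUG("compiled in at verbosity %d\n", GGML_DEBUG); // prints
    GGML_PRINT_DEBUG_10("compiled out below verbosity 10\n");      // no-op
    return 0;
}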
@@ -172,6 +201,17 @@ inline static void* ggml_aligned_malloc(size_t size) {
 #endif
     if (result != 0) {
         // Handle allocation failure
+        const char *error_desc = "unknown allocation error";
+        switch (result) {
+            case EINVAL:
+                error_desc = "invalid alignment value";
+                break;
+            case ENOMEM:
+                error_desc = "insufficient memory";
+                break;
+        }
+        GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n",
+            __func__, error_desc, size/(1024.0*1024.0));
         return NULL;
     }
     return aligned_memory;
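
EINVAL and ENOMEM are the two error codes POSIX documents for posix_memalign, so the switch above covers every defined failure. A standalone C illustration of the same contract:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

int main(void) {
    void * p = NULL;
    // alignment must be a power of two and a multiple of sizeof(void *),
    // otherwise posix_memalign returns EINVAL; ENOMEM means out of memory
    int rc = posix_memalign(&p, 16, 1024);
    if (rc != 0) {
        fprintf(stderr, "alloc failed: %s\n",
                rc == EINVAL ? "invalid alignment value" :
                rc == ENOMEM ? "insufficient memory"     :
                               "unknown allocation error");
        return 1;
    }
    free(p);
    return 0;
}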
@@ -3717,11 +3757,15 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "MAP_UNARY",
     "MAP_BINARY",
 
+    "MAP_CUSTOM1",
+    "MAP_CUSTOM2",
+    "MAP_CUSTOM3",
+
     "CROSS_ENTROPY_LOSS",
     "CROSS_ENTROPY_LOSS_BACK",
 };
 
-static_assert(GGML_OP_COUNT == 61, "GGML_OP_COUNT != 61");
+static_assert(GGML_OP_COUNT == 64, "GGML_OP_COUNT != 64");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -3789,11 +3833,15 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "f(x)",
     "f(x,y)",
 
+    "custom(x)",
+    "custom(x,y)",
+    "custom(x,y,z)",
+
     "cross_entropy_loss(x,y)",
     "cross_entropy_loss_back(x,y)",
 };
 
-static_assert(GGML_OP_COUNT == 61, "GGML_OP_COUNT != 61");
+static_assert(GGML_OP_COUNT == 64, "GGML_OP_COUNT != 64");
 
 static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
 static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
@@ -4821,10 +4869,19 @@ struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * nam
     return tensor;
 }
 
+struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...) {
+    va_list args;
+    va_start(args, fmt);
+    vsnprintf(tensor->name, sizeof(tensor->name), fmt, args);
+    va_end(args);
+    return tensor;
+}
+
 struct ggml_tensor * ggml_view_tensor(
         struct ggml_context * ctx,
         const struct ggml_tensor * src) {
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data);
+    ggml_format_name(result, "%s (view)", src->name);
 
     result->nb[0] = src->nb[0];
     result->nb[1] = src->nb[1];
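
A hedged usage sketch for the new ggml_format_name, showing how the view call sites below inherit names. Context size and tensor shape are illustrative values:

#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        .mem_size   = 16*1024*1024,
        .mem_buffer = NULL,
        .no_alloc   = false,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    ggml_format_name(t, "layer_%d/%s", 0, "attn_k");    // name: "layer_0/attn_k"

    struct ggml_tensor * v = ggml_view_tensor(ctx, t);  // name: "layer_0/attn_k (view)"
    (void) v;

    ggml_free(ctx);
    return 0;
}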
@@ -5986,6 +6043,11 @@ struct ggml_tensor * ggml_cpy_impl(
 
     // make a view of the destination
     struct ggml_tensor * result = ggml_view_tensor(ctx, b);
+    if (strlen(b->name) > 0) {
+        ggml_format_name(result, "%s (copy of %s)", b->name, a->name);
+    } else {
+        ggml_format_name(result, "%s (copy)", a->name);
+    }
 
     result->op = GGML_OP_CPY;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6022,6 +6084,7 @@ struct ggml_tensor * ggml_cont_impl(
     }
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+    ggml_format_name(result, "%s (cont)", a->name);
 
     result->op = GGML_OP_CONT;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6065,6 +6128,7 @@ struct ggml_tensor * ggml_reshape(
     }
 
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a->data);
+    ggml_format_name(result, "%s (reshaped)", a->name);
 
     result->op = GGML_OP_RESHAPE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6089,6 +6153,7 @@ struct ggml_tensor * ggml_reshape_1d(
 
     const int64_t ne[1] = { ne0 };
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a->data);
+    ggml_format_name(result, "%s (reshaped)", a->name);
 
     result->op = GGML_OP_RESHAPE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6114,6 +6179,7 @@ struct ggml_tensor * ggml_reshape_2d(
 
     const int64_t ne[2] = { ne0, ne1 };
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a->data);
+    ggml_format_name(result, "%s (reshaped)", a->name);
 
     result->op = GGML_OP_RESHAPE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6140,6 +6206,7 @@ struct ggml_tensor * ggml_reshape_3d(
 
     const int64_t ne[3] = { ne0, ne1, ne2 };
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a->data);
+    ggml_format_name(result, "%s (reshaped)", a->name);
 
     result->op = GGML_OP_RESHAPE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6168,6 +6235,7 @@ struct ggml_tensor * ggml_reshape_4d(
 
     const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a->data);
+    ggml_format_name(result, "%s (reshaped)", a->name);
 
     result->op = GGML_OP_RESHAPE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6192,10 +6260,12 @@ struct ggml_tensor * ggml_view_1d(
     }
 
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset);
+    ggml_format_name(result, "%s (view)", a->name);
 
     ggml_scratch_save(ctx);
 
     struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+    ggml_set_name(offs, "offset");
     memcpy(offs->data, &offset, 2*sizeof(int32_t));
 
     ggml_scratch_load(ctx);
@@ -6228,10 +6298,12 @@ struct ggml_tensor * ggml_view_2d(
     const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 };
 
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset);
+    ggml_format_name(result, "%s (view)", a->name);
 
     ggml_scratch_save(ctx);
 
     struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+    ggml_set_name(offs, "offset");
     memcpy(offs->data, &offset, 2*sizeof(int32_t));
 
     ggml_scratch_load(ctx);
@@ -6270,10 +6342,12 @@ struct ggml_tensor * ggml_view_3d(
     const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 };
 
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset);
+    ggml_format_name(result, "%s (view)", a->name);
 
     ggml_scratch_save(ctx);
 
     struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+    ggml_set_name(offs, "offset");
     memcpy(offs->data, &offset, 2*sizeof(int32_t));
 
     ggml_scratch_load(ctx);
@@ -6314,10 +6388,12 @@ struct ggml_tensor * ggml_view_4d(
     const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, ne3 };
 
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset);
+    ggml_format_name(result, "%s (view)", a->name);
 
     ggml_scratch_save(ctx);
 
     struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+    ggml_set_name(offs, "offset");
     memcpy(offs->data, &offset, 2*sizeof(int32_t));
 
     ggml_scratch_load(ctx);
@@ -6363,6 +6439,7 @@ struct ggml_tensor * ggml_permute(
     }
 
     struct ggml_tensor * result = ggml_view_tensor(ctx, a);
+    ggml_format_name(result, "%s (permuted)", a->name);
 
     int ne[GGML_MAX_DIMS];
     int nb[GGML_MAX_DIMS];
@@ -6422,6 +6499,7 @@ struct ggml_tensor * ggml_transpose(
     }
 
     struct ggml_tensor * result = ggml_view_tensor(ctx, a);
+    ggml_format_name(result, "%s (transposed)", a->name);
 
     result->ne[0] = a->ne[1];
     result->ne[1] = a->ne[0];
@@ -7150,9 +7228,14 @@ struct ggml_tensor * ggml_map_unary_impl_f32(
         is_node = true;
     }
 
+    struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    ggml_scratch_save(ctx);
+
     struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
     *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
-    struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    ggml_scratch_load(ctx);
 
     result->op = GGML_OP_MAP_UNARY;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -7192,9 +7275,14 @@ struct ggml_tensor * ggml_map_binary_impl_f32(
         is_node = true;
     }
 
+    struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    ggml_scratch_save(ctx);
+
     struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
     *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
-    struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    ggml_scratch_load(ctx);
 
     result->op = GGML_OP_MAP_BINARY;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -7221,6 +7309,150 @@ struct ggml_tensor * ggml_map_binary_inplace_f32(
     return ggml_map_binary_impl_f32(ctx, a, b, fun, true);
 }
 
+// ggml_map_custom1
+
+struct ggml_tensor * ggml_map_custom1_impl_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        const ggml_custom1_op_f32_t fun,
+        bool inplace) {
+    bool is_node = false;
+
+    if (!inplace && a->grad) {
+        is_node = true;
+    }
+
+    struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    ggml_scratch_save(ctx);
+
+    struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
+    *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
+
+    ggml_scratch_load(ctx);
+
+    result->op = GGML_OP_MAP_CUSTOM1;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src0 = a;
+    result->opt[0] = addr_tensor;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_map_custom1_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        const ggml_custom1_op_f32_t fun) {
+    return ggml_map_custom1_impl_f32(ctx, a, fun, false);
+}
+
+struct ggml_tensor * ggml_map_custom1_inplace_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        const ggml_custom1_op_f32_t fun) {
+    return ggml_map_custom1_impl_f32(ctx, a, fun, true);
+}
+
+// ggml_map_custom2
+
+struct ggml_tensor * ggml_map_custom2_impl_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        const ggml_custom2_op_f32_t fun,
+        bool inplace) {
+    bool is_node = false;
+
+    if (!inplace && (a->grad || b->grad)) {
+        is_node = true;
+    }
+
+    struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    ggml_scratch_save(ctx);
+
+    struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
+    *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
+
+    ggml_scratch_load(ctx);
+
+    result->op = GGML_OP_MAP_CUSTOM2;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src0 = a;
+    result->src1 = b;
+    result->opt[0] = addr_tensor;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_map_custom2_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        const ggml_custom2_op_f32_t fun) {
+    return ggml_map_custom2_impl_f32(ctx, a, b, fun, false);
+}
+
+struct ggml_tensor * ggml_map_custom2_inplace_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        const ggml_custom2_op_f32_t fun) {
+    return ggml_map_custom2_impl_f32(ctx, a, b, fun, true);
+}
+
+// ggml_map_custom3
+
+struct ggml_tensor * ggml_map_custom3_impl_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        struct ggml_tensor  * c,
+        const ggml_custom3_op_f32_t fun,
+        bool inplace) {
+    bool is_node = false;
+
+    if (!inplace && (a->grad || b->grad || c->grad)) {
+        is_node = true;
+    }
+
+    struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    ggml_scratch_save(ctx);
+
+    struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
+    *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
+
+    ggml_scratch_load(ctx);
+
+    result->op = GGML_OP_MAP_CUSTOM3;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src0 = a;
+    result->src1 = b;
+    result->opt[0] = addr_tensor;
+    result->opt[1] = c;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_map_custom3_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        struct ggml_tensor  * c,
+        const ggml_custom3_op_f32_t fun) {
+    return ggml_map_custom3_impl_f32(ctx, a, b, c, fun, false);
+}
+
+struct ggml_tensor * ggml_map_custom3_inplace_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        struct ggml_tensor  * c,
+        const ggml_custom3_op_f32_t fun) {
+    return ggml_map_custom3_impl_f32(ctx, a, b, c, fun, true);
+}
+
 // ggml_cross_entropy_loss
 
 struct ggml_tensor * ggml_cross_entropy_loss(
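
A usage sketch for the new custom map API. The callback signature is inferred from the forward implementation further down, which invokes fun(dst, a); the op name square_f32 is made up for illustration:

// custom op: dst = a squared, element-wise (f32 tensors only)
static void square_f32(struct ggml_tensor * dst, const struct ggml_tensor * a) {
    const float * x = (const float *) a->data;
    float       * y = (float *) dst->data;
    const int n = (int) ggml_nelements(a);
    for (int i = 0; i < n; ++i) {
        y[i] = x[i] * x[i];
    }
}

// during graph construction (inp is an assumed existing f32 tensor):
// struct ggml_tensor * out = ggml_map_custom1_f32(ctx, inp, square_f32);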
@@ -14677,6 +14909,114 @@ static void ggml_compute_forward_map_binary(
     }
 }
 
+// ggml_compute_forward_map_custom1
+
+static void ggml_compute_forward_map_custom1_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * a,
+        struct ggml_tensor * dst,
+        const ggml_custom1_op_f32_t fun) {
+    assert(params->ith == 0);
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    fun(dst, a);
+}
+
+
+static void ggml_compute_forward_map_custom1(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * a,
+        struct ggml_tensor * dst,
+        const ggml_custom1_op_f32_t fun) {
+    switch (a->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_map_custom1_f32(params, a, dst, fun);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
+// ggml_compute_forward_map_custom2
+
+static void ggml_compute_forward_map_custom2_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * a,
+        const struct ggml_tensor * b,
+        struct ggml_tensor * dst,
+        const ggml_custom2_op_f32_t fun) {
+    assert(params->ith == 0);
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    fun(dst, a, b);
+}
+
+
+static void ggml_compute_forward_map_custom2(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * a,
+        const struct ggml_tensor * b,
+        struct ggml_tensor * dst,
+        const ggml_custom2_op_f32_t fun) {
+    switch (a->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_map_custom2_f32(params, a, b, dst, fun);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
+// ggml_compute_forward_map_custom3
+
+static void ggml_compute_forward_map_custom3_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * a,
+        const struct ggml_tensor * b,
+        const struct ggml_tensor * c,
+        struct ggml_tensor * dst,
+        const ggml_custom3_op_f32_t fun) {
+    assert(params->ith == 0);
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    fun(dst, a, b, c);
+}
+
+
+static void ggml_compute_forward_map_custom3(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * a,
+        const struct ggml_tensor * b,
+        const struct ggml_tensor * c,
+        struct ggml_tensor * dst,
+        const ggml_custom3_op_f32_t fun) {
+    switch (a->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_map_custom3_f32(params, a, b, c, dst, fun);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
 // ggml_compute_forward_cross_entropy_loss
 
 static void ggml_compute_forward_cross_entropy_loss_f32(
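
Each custom op smuggles its callback through the I32 tensor stored in opt[0]; the dispatch cases below recover it by casting the tensor's data back to a function-pointer type. A self-contained C sketch of that round trip; the real code uses direct casts, and, like ggml, this assumes a function pointer fits in sizeof(void *):

#include <assert.h>
#include <stdint.h>
#include <string.h>

typedef void (*custom_fn)(void);

static void demo(void) {
    // the impl functions allocate sizeof(void *) / sizeof(int32_t) int32 slots
    int32_t storage[sizeof(void *) / sizeof(int32_t)];
    custom_fn in = (custom_fn) demo;     // any function pointer
    memcpy(storage, &in, sizeof in);     // store ("addr_tensor->data")
    custom_fn out;
    memcpy(&out, storage, sizeof out);   // load back at dispatch time
    assert(out == in);
}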
@@ -14967,7 +15307,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
         if (skip_cpu) {
             return;
         }
-        GGML_ASSERT(tensor->src0->backend == GGML_BACKEND_CPU);
+        GGML_ASSERT(tensor->src0 == NULL || tensor->src0->backend == GGML_BACKEND_CPU);
         GGML_ASSERT(tensor->src1 == NULL || tensor->src1->backend == GGML_BACKEND_CPU);
 #endif // GGML_USE_CUBLAS
 
@@ -15214,6 +15554,24 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
                 ggml_compute_forward_map_binary(params, tensor->src0, tensor->src1, tensor, fun);
             }
             break;
+        case GGML_OP_MAP_CUSTOM1:
+            {
+                const ggml_custom1_op_f32_t fun = *((ggml_custom1_op_f32_t *)tensor->opt[0]->data);
+                ggml_compute_forward_map_custom1(params, tensor->src0, tensor, fun);
+            }
+            break;
+        case GGML_OP_MAP_CUSTOM2:
+            {
+                const ggml_custom2_op_f32_t fun = *((ggml_custom2_op_f32_t *)tensor->opt[0]->data);
+                ggml_compute_forward_map_custom2(params, tensor->src0, tensor->src1, tensor, fun);
+            }
+            break;
+        case GGML_OP_MAP_CUSTOM3:
+            {
+                const ggml_custom3_op_f32_t fun = *((ggml_custom3_op_f32_t *)tensor->opt[0]->data);
+                ggml_compute_forward_map_custom3(params, tensor->src0, tensor->src1, tensor->opt[1], tensor, fun);
+            }
+            break;
         case GGML_OP_CROSS_ENTROPY_LOSS:
             {
                 ggml_compute_forward_cross_entropy_loss(params, tensor->src0, tensor->src1, tensor);
@@ -16020,6 +16378,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
         case GGML_OP_WIN_UNPART:
         case GGML_OP_MAP_UNARY:
         case GGML_OP_MAP_BINARY:
+        case GGML_OP_MAP_CUSTOM1:
+        case GGML_OP_MAP_CUSTOM2:
+        case GGML_OP_MAP_CUSTOM3:
            {
                 GGML_ASSERT(false); // not supported
             } break;
@@ -16091,7 +16452,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
             GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES);
 
             if (strlen(node->name) == 0) {
-                snprintf(node->name, sizeof(node->name), "leaf_%d", cgraph->n_leafs);
+                ggml_format_name(node, "leaf_%d", cgraph->n_leafs);
             }
 
             cgraph->leafs[cgraph->n_leafs] = node;
@@ -16100,7 +16461,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
             GGML_ASSERT(cgraph->n_nodes < GGML_MAX_NODES);
 
             if (strlen(node->name) == 0) {
-                snprintf(node->name, sizeof(node->name), "node_%d", cgraph->n_nodes);
+                ggml_format_name(node, "node_%d", cgraph->n_nodes);
             }
 
             cgraph->nodes[cgraph->n_nodes] = node;
@@ -16727,6 +17088,9 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                 case GGML_OP_WIN_UNPART:
                 case GGML_OP_MAP_UNARY:
                 case GGML_OP_MAP_BINARY:
+                case GGML_OP_MAP_CUSTOM1:
+                case GGML_OP_MAP_CUSTOM2:
+                case GGML_OP_MAP_CUSTOM3:
                     {
                         node->n_tasks = 1;
                     } break;
@@ -17422,6 +17786,26 @@ static struct ggml_tensor * ggml_graph_get_parent(const struct ggml_cgraph * cgr
     return NULL;
 }
 
+static void ggml_graph_dump_dot_node_edge(FILE * fp, const struct ggml_cgraph * gb, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label) {
+    struct ggml_tensor * gparent = ggml_graph_get_parent(gb, node);
+    struct ggml_tensor * gparent0 = ggml_graph_get_parent(gb, parent);
+    fprintf(fp, "  \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"%s\"; ]\n",
+            gparent0 ? (void *) gparent0 : (void *) parent,
+            gparent0 ? "g" : "x",
+            gparent ? (void *) gparent : (void *) node,
+            gparent ? "g" : "x",
+            gparent ? "empty" : "vee",
+            gparent ? "dashed" : "solid",
+            label);
+}
+
+static void ggml_graph_dump_dot_leaf_edge(FILE * fp, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label) {
+    fprintf(fp, "  \"%p\":%s -> \"%p\":%s [ label = \"%s\"; ]\n",
+            (void *) parent, "x",
+            (void *) node, "x",
+            label);
+}
+
 void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename) {
     char color[16];
 
|
@ -17457,7 +17841,9 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
|
||||||
(void *) node, color);
|
(void *) node, color);
|
||||||
|
|
||||||
if (strlen(node->name) > 0) {
|
if (strlen(node->name) > 0) {
|
||||||
fprintf(fp, "%s |", node->name);
|
fprintf(fp, "%s (%s)|", node->name, ggml_type_name(node->type));
|
||||||
|
} else {
|
||||||
|
fprintf(fp, "(%s)|", ggml_type_name(node->type));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (node->n_dims == 2) {
|
if (node->n_dims == 2) {
|
||||||
|
@ -17466,7 +17852,6 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
|
||||||
fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], GGML_OP_SYMBOL[node->op]);
|
fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], GGML_OP_SYMBOL[node->op]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
if (node->grad) {
|
if (node->grad) {
|
||||||
fprintf(fp, " | <g>%s\"; ]\n", GGML_OP_SYMBOL[node->grad->op]);
|
fprintf(fp, " | <g>%s\"; ]\n", GGML_OP_SYMBOL[node->grad->op]);
|
||||||
} else {
|
} else {
|
||||||
|
@ -17485,18 +17870,29 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
|
||||||
(void *) node, color);
|
(void *) node, color);
|
||||||
|
|
||||||
if (strlen(node->name) > 0) {
|
if (strlen(node->name) > 0) {
|
||||||
fprintf(fp, "%s | ", node->name);
|
fprintf(fp, "%s (%s)|", node->name, ggml_type_name(node->type));
|
||||||
|
} else {
|
||||||
|
fprintf(fp, "(%s)|", ggml_type_name(node->type));
|
||||||
}
|
}
|
||||||
if (ggml_nelements(node) == 1) {
|
|
||||||
if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) {
|
|
||||||
fprintf(fp, "%d", ggml_get_i32_1d(node, 0));
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
fprintf(fp, "%.1e", (double)ggml_get_f32_1d(node, 0));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]);
|
fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]);
|
||||||
|
if (ggml_nelements(node) < 5) {
|
||||||
|
fprintf(fp, " | (");
|
||||||
|
for (int j = 0; j < ggml_nelements(node); j++) {
|
||||||
|
if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) {
|
||||||
|
fprintf(fp, "%d", ggml_get_i32_1d(node, j));
|
||||||
|
}
|
||||||
|
else if (node->type == GGML_TYPE_F32 || node->type == GGML_TYPE_F16) {
|
||||||
|
fprintf(fp, "%.1e", (double)ggml_get_f32_1d(node, j));
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
fprintf(fp, "#");
|
||||||
|
}
|
||||||
|
if (j < ggml_nelements(node) - 1) {
|
||||||
|
fprintf(fp, ", ");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
fprintf(fp, ")");
|
||||||
}
|
}
|
||||||
fprintf(fp, "\"; ]\n");
|
fprintf(fp, "\"; ]\n");
|
||||||
}
|
}
|
||||||
|
@ -17504,30 +17900,20 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
|
||||||
for (int i = 0; i < gb->n_nodes; i++) {
|
for (int i = 0; i < gb->n_nodes; i++) {
|
||||||
struct ggml_tensor * node = gb->nodes[i];
|
struct ggml_tensor * node = gb->nodes[i];
|
||||||
|
|
||||||
struct ggml_tensor * parent = ggml_graph_get_parent(gb, node);
|
|
||||||
|
|
||||||
if (node->src0) {
|
if (node->src0) {
|
||||||
struct ggml_tensor * parent0 = ggml_graph_get_parent(gb, node->src0);
|
ggml_graph_dump_dot_node_edge(fp, gb, node, node->src0, "x");
|
||||||
|
|
||||||
fprintf(fp, " \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"x\"; ]\n",
|
|
||||||
parent0 ? (void *) parent0 : (void *) node->src0,
|
|
||||||
parent0 ? "g" : "x",
|
|
||||||
parent ? (void *) parent : (void *) node,
|
|
||||||
parent ? "g" : "x",
|
|
||||||
parent ? "empty" : "vee",
|
|
||||||
parent ? "dashed" : "solid");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (node->src1) {
|
if (node->src1) {
|
||||||
struct ggml_tensor * parent1 = ggml_graph_get_parent(gb, node->src1);
|
ggml_graph_dump_dot_node_edge(fp, gb, node, node->src1, "y");
|
||||||
|
}
|
||||||
|
|
||||||
fprintf(fp, " \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"y\"; ]\n",
|
for (int j = 0; j < GGML_MAX_OPT; j++) {
|
||||||
parent1 ? (void *) parent1 : (void *) node->src1,
|
if (node->opt[j]) {
|
||||||
parent1 ? "g" : "x",
|
char label[16];
|
||||||
parent ? (void *) parent : (void *) node,
|
snprintf(label, sizeof(label), "opt %d", j);
|
||||||
parent ? "g" : "x",
|
ggml_graph_dump_dot_node_edge(fp, gb, node, node->opt[j], label);
|
||||||
parent ? "empty" : "vee",
|
}
|
||||||
parent ? "dashed" : "solid");
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -17535,15 +17921,19 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
|
||||||
struct ggml_tensor * node = gb->leafs[i];
|
struct ggml_tensor * node = gb->leafs[i];
|
||||||
|
|
||||||
if (node->src0) {
|
if (node->src0) {
|
||||||
fprintf(fp, " \"%p\":%s -> \"%p\":%s [ label = \"x\"; ]\n",
|
ggml_graph_dump_dot_leaf_edge(fp, node, node->src0, "x");
|
||||||
(void *) node->src0, "x",
|
|
||||||
(void *) node, "x");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (node->src1) {
|
if (node->src1) {
|
||||||
fprintf(fp, " \"%p\":%s -> \"%p\":%s [ label = \"y\"; ]\n",
|
ggml_graph_dump_dot_leaf_edge(fp, node, node->src1, "y");
|
||||||
(void *) node->src1, "x",
|
}
|
||||||
(void *) node, "x");
|
|
||||||
|
for (int j = 0; j < GGML_MAX_OPT; j++) {
|
||||||
|
if (node->opt[j]) {
|
||||||
|
char label[16];
|
||||||
|
snprintf(label, sizeof(label), "opt %d", j);
|
||||||
|
ggml_graph_dump_dot_leaf_edge(fp, node, node->opt[j], label);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -18262,7 +18652,6 @@ GGML_API void ggml_opt_init(
|
||||||
ggml_set_zero(opt->lbfgs.g);
|
ggml_set_zero(opt->lbfgs.g);
|
||||||
ggml_set_zero(opt->lbfgs.gp);
|
ggml_set_zero(opt->lbfgs.gp);
|
||||||
ggml_set_zero(opt->lbfgs.d);
|
ggml_set_zero(opt->lbfgs.d);
|
||||||
ggml_set_zero(opt->lbfgs.pf);
|
|
||||||
if (opt->lbfgs.pf) {
|
if (opt->lbfgs.pf) {
|
||||||
ggml_set_zero(opt->lbfgs.pf);
|
ggml_set_zero(opt->lbfgs.pf);
|
||||||
}
|
}
|
||||||
|
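For reference, each call to the extracted ggml_graph_dump_dot_node_edge helper above emits one DOT edge along these lines (pointer values are illustrative, not taken from a real run):

  "0x55aa10":x -> "0x55aa90":x [ arrowhead = vee; style = solid; label = "opt 0"; ]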
ggml.h (59 lines changed)
@@ -345,6 +345,10 @@ extern "C" {
         GGML_OP_MAP_UNARY,
         GGML_OP_MAP_BINARY,
+
+        GGML_OP_MAP_CUSTOM1,
+        GGML_OP_MAP_CUSTOM2,
+        GGML_OP_MAP_CUSTOM3,

         GGML_OP_CROSS_ENTROPY_LOSS,
         GGML_OP_CROSS_ENTROPY_LOSS_BACK,
@@ -566,6 +570,7 @@ extern "C" {

     GGML_API const char * ggml_get_name(const struct ggml_tensor * tensor);
     GGML_API struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name);
+    GGML_API struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...);

     //
     // operations on tensors with backpropagation
@@ -1169,21 +1174,73 @@ extern "C" {
             int h0,
             int w);

-    // Mapping operations
+    // custom operators

     typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
     typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);

+    typedef void (*ggml_custom1_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *);
+    typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
+    typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
+
     GGML_API struct ggml_tensor * ggml_map_unary_f32(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             ggml_unary_op_f32_t fun);

+    GGML_API struct ggml_tensor * ggml_map_unary_inplace_f32(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            ggml_unary_op_f32_t fun);
+
     GGML_API struct ggml_tensor * ggml_map_binary_f32(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             struct ggml_tensor * b,
             ggml_binary_op_f32_t fun);

+    GGML_API struct ggml_tensor * ggml_map_binary_inplace_f32(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            ggml_binary_op_f32_t fun);
+
+    GGML_API struct ggml_tensor * ggml_map_custom1_f32(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            ggml_custom1_op_f32_t fun);
+
+    GGML_API struct ggml_tensor * ggml_map_custom1_inplace_f32(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            ggml_custom1_op_f32_t fun);
+
+    GGML_API struct ggml_tensor * ggml_map_custom2_f32(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            ggml_custom2_op_f32_t fun);
+
+    GGML_API struct ggml_tensor * ggml_map_custom2_inplace_f32(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            ggml_custom2_op_f32_t fun);
+
+    GGML_API struct ggml_tensor * ggml_map_custom3_f32(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            struct ggml_tensor * c,
+            ggml_custom3_op_f32_t fun);
+
+    GGML_API struct ggml_tensor * ggml_map_custom3_inplace_f32(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            struct ggml_tensor * c,
+            ggml_custom3_op_f32_t fun);
+
     // loss function

     GGML_API struct ggml_tensor * ggml_cross_entropy_loss(
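A minimal sketch of plugging a function into the new custom1 hook (my_square_f32 is an illustrative name, not part of the API; assumes contiguous F32 tensors):

static void my_square_f32(struct ggml_tensor * dst, const struct ggml_tensor * src) {
    // dst has the same shape as src; compute dst[i] = src[i]^2
    float       * d = (float *) dst->data;
    const float * s = (const float *) src->data;
    for (int64_t i = 0; i < ggml_nelements(src); ++i) {
        d[i] = s[i] * s[i];
    }
}

// later, while building the graph; the callback runs during graph compute:
//     struct ggml_tensor * y = ggml_map_custom1_f32(ctx, x, my_square_f32);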
k_quants.c (1686 lines changed; diff suppressed because it is too large)
k_quants.h (51 lines changed)
@@ -7,7 +7,13 @@
 #include <stddef.h>

 // Super-block size
+#ifdef GGML_QKK_64
+#define QK_K 64
+#define K_SCALE_SIZE 4
+#else
 #define QK_K 256
+#define K_SCALE_SIZE 12
+#endif

 //
 // Super-block quantization structures
@@ -29,38 +35,67 @@ static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "w
 // weight is represented as x = a * q
 // 16 blocks of 16 elemenets each
 // Effectively 3.4375 bits per weight
+#ifdef GGML_QKK_64
 typedef struct {
     uint8_t hmask[QK_K/8];     // quants - high bit
     uint8_t qs[QK_K/4];        // quants - low 2 bits
-    uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
+    uint8_t scales[2];
     ggml_fp16_t d;             // super-block scale
 } block_q3_K;
-static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + 11 * QK_K / 64, "wrong q3_K block size/padding");
+static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding");
+#else
+typedef struct {
+    uint8_t hmask[QK_K/8];     // quants - high bit
+    uint8_t qs[QK_K/4];        // quants - low 2 bits
+    uint8_t scales[12];        // scales, quantized with 6 bits
+    ggml_fp16_t d;             // super-block scale
+} block_q3_K;
+static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
+#endif

 // 4-bit quantization
 // 16 blocks of 32 elements each
 // weight is represented as x = a * q + b
 // Effectively 4.5 bits per weight
+#ifdef GGML_QKK_64
+typedef struct {
+    ggml_fp16_t d[2];          // super-block scales/mins
+    uint8_t scales[2];         // 4-bit block scales/mins
+    uint8_t qs[QK_K/2];        // 4--bit quants
+} block_q4_K;
+static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding");
+#else
 typedef struct {
     ggml_fp16_t d;                // super-block scale for quantized scales
     ggml_fp16_t dmin;             // super-block scale for quantized mins
-    uint8_t scales[3*QK_K/64];    // scales and mins, quantized with 6 bits
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
     uint8_t qs[QK_K/2];           // 4--bit quants
 } block_q4_K;
-static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
+static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
+#endif

 // 5-bit quantization
 // 16 blocks of 32 elements each
 // weight is represented as x = a * q + b
 // Effectively 5.5 bits per weight
+#ifdef GGML_QKK_64
 typedef struct {
-    ggml_fp16_t d;             // super-block scale for quantized scales
-    ggml_fp16_t dmin;          // super-block scale for quantized mins
-    uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
+    ggml_fp16_t d;             // super-block scale
+    int8_t scales[QK_K/16];    // 8-bit block scales
     uint8_t qh[QK_K/8];        // quants, high bit
     uint8_t qs[QK_K/2];        // quants, low 4 bits
 } block_q5_K;
-static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
+static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
+#else
+typedef struct {
+    ggml_fp16_t d;                // super-block scale for quantized scales
+    ggml_fp16_t dmin;             // super-block scale for quantized mins
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8];           // quants, high bit
+    uint8_t qs[QK_K/2];           // quants, low 4 bits
+} block_q5_K;
+static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
+#endif

 // 6-bit quantization
 // weight is represented as x = a * q
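As a quick sanity check on the q4_K static_asserts above (assuming sizeof(ggml_fp16_t) == 2):

  QK_K = 256: 2*2 (d, dmin) + 12 (scales) + 256/2 (qs) = 144 bytes per 256 weights, i.e. 4.5 bits per weight
  QK_K = 64:  2*2 (d[2])    +  2 (scales) +  64/2 (qs) =  38 bytes per  64 weights, i.e. 4.75 bits per weight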
llama.cpp (200 lines changed)
@@ -21,9 +21,13 @@
 #endif
 #ifdef GGML_USE_K_QUANTS
 #ifndef QK_K
+#ifdef GGML_QKK_64
+#define QK_K 64
+#else
 #define QK_K 256
 #endif
 #endif
+#endif

 #include <array>
 #include <ctime>
@@ -182,6 +186,19 @@ struct llama_kv_cache {
     }
 };

+struct llama_vocab {
+    using id    = int32_t;
+    using token = std::string;
+
+    struct token_score {
+        token tok;
+        float score;
+    };
+
+    std::unordered_map<token, id> token_to_id;
+    std::vector<token_score> id_to_token;
+};
+
 struct llama_model {
     e_model type = MODEL_UNKNOWN;

@@ -198,10 +215,6 @@ struct llama_model {
     // context
     struct ggml_context * ctx = NULL;

-    // key + value cache for the self attention
-    // TODO: move to llama_state
-    struct llama_kv_cache kv_self;
-
     // the model memory buffer
     llama_ctx_buffer buf;

@@ -215,6 +228,11 @@ struct llama_model {
     // for quantize-stats only
     std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;

+    int64_t t_load_us = 0;
+    int64_t t_start_us = 0;
+
+    llama_vocab vocab;
+
     ~llama_model() {
         if (ctx) {
             ggml_free(ctx);
@@ -233,24 +251,11 @@ struct llama_model {
     }
 };

-struct llama_vocab {
-    using id    = int32_t;
-    using token = std::string;
-
-    struct token_score {
-        token tok;
-        float score;
-    };
-
-    std::unordered_map<token, id> token_to_id;
-    std::vector<token_score> id_to_token;
-};
-
 struct llama_context {
+    llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
+
     std::mt19937 rng;

-    int64_t t_load_us = 0;
-    int64_t t_start_us = 0;
     bool has_evaluated_once = false;

     int64_t t_sample_us = 0;
@@ -261,8 +266,16 @@ struct llama_context {
     int32_t n_eval   = 0; // number of eval calls
     int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)

-    llama_model model;
-    llama_vocab vocab;
+    const llama_model & model;
+    const llama_vocab & vocab;
+
+    bool model_owner = false;
+
+    int64_t t_load_us;
+    int64_t t_start_us;
+
+    // key + value cache for the self attention
+    struct llama_kv_cache kv_self;

     size_t mem_per_token = 0;

@@ -925,21 +938,21 @@ static bool kv_cache_init(

 struct llama_context_params llama_context_default_params() {
     struct llama_context_params result = {
+        /*.seed                        =*/ -1,
         /*.n_ctx                       =*/ 512,
         /*.n_batch                     =*/ 512,
         /*.gpu_layers                  =*/ 0,
         /*.main_gpu                    =*/ 0,
         /*.tensor_split                =*/ {0},
+        /*.progress_callback           =*/ nullptr,
+        /*.progress_callback_user_data =*/ nullptr,
         /*.low_vram                    =*/ false,
-        /*.seed                        =*/ -1,
         /*.f16_kv                      =*/ true,
         /*.logits_all                  =*/ false,
         /*.vocab_only                  =*/ false,
         /*.use_mmap                    =*/ true,
         /*.use_mlock                   =*/ false,
         /*.embedding                   =*/ false,
-        /*.progress_callback           =*/ nullptr,
-        /*.progress_callback_user_data =*/ nullptr,
     };

     return result;
@@ -1033,7 +1046,8 @@ static const char *llama_model_type_name(e_model type) {

 static void llama_model_load_internal(
         const std::string & fname,
-        llama_context & lctx,
+        llama_model & model,
+        llama_vocab & vocab,
         int n_ctx,
         int n_batch,
         int n_gpu_layers,
@@ -1047,12 +1061,11 @@ static void llama_model_load_internal(
         llama_progress_callback progress_callback,
         void * progress_callback_user_data) {

-    lctx.t_start_us = ggml_time_us();
+    model.t_start_us = ggml_time_us();

     std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap, vocab_only));

-    lctx.vocab = std::move(ml->file_loaders.at(0)->vocab);
-    auto & model = lctx.model;
+    vocab = std::move(ml->file_loaders.at(0)->vocab);
     model.hparams = ml->file_loaders.at(0)->hparams;
     model.n_gpu_layers = n_gpu_layers;
     llama_file_version file_version = ml->file_loaders.at(0)->file_version;
@@ -1122,15 +1135,15 @@ static void llama_model_load_internal(

     // create the ggml context
     {
-        lctx.model.buf.resize(ctx_size);
+        model.buf.resize(ctx_size);
         if (use_mlock) {
-            lctx.model.mlock_buf.init(lctx.model.buf.addr);
-            lctx.model.mlock_buf.grow_to(lctx.model.buf.size);
+            model.mlock_buf.init(model.buf.addr);
+            model.mlock_buf.grow_to(model.buf.size);
         }

         struct ggml_init_params params = {
-            /*.mem_size   =*/ lctx.model.buf.size,
-            /*.mem_buffer =*/ lctx.model.buf.addr,
+            /*.mem_size   =*/ model.buf.size,
+            /*.mem_buffer =*/ model.buf.addr,
             /*.no_alloc   =*/ ml->use_mmap,
         };

@@ -1311,7 +1324,7 @@ static void llama_model_load_internal(
     }
 #endif

-    ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
+    ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL);

     if (progress_callback) {
         progress_callback(1.0f, progress_callback_user_data);
@@ -1321,12 +1334,13 @@ static void llama_model_load_internal(

     // loading time will be recalculate after the first eval, so
     // we take page faults deferred by mmap() into consideration
-    lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
+    model.t_load_us = ggml_time_us() - model.t_start_us;
 }

 static bool llama_model_load(
         const std::string & fname,
-        llama_context & lctx,
+        llama_model & model,
+        llama_vocab & vocab,
         int n_ctx,
         int n_batch,
         int n_gpu_layers,
@@ -1340,7 +1354,7 @@ static bool llama_model_load(
         llama_progress_callback progress_callback,
         void *progress_callback_user_data) {
     try {
-        llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
+        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
                                   use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
         return true;
     } catch (const std::exception & err) {
@@ -1378,7 +1392,7 @@ static bool llama_eval_internal(
     const auto & model   = lctx.model;
     const auto & hparams = model.hparams;

-    const auto & kv_self = model.kv_self;
+    const auto & kv_self = lctx.kv_self;

     LLAMA_ASSERT(!!kv_self.ctx);

@@ -1726,7 +1740,7 @@ static bool llama_eval_internal(
     //memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N);

     // update kv token count
-    lctx.model.kv_self.n = n_past + N;
+    lctx.kv_self.n = n_past + N;

     // extract logits
     {
@@ -2005,9 +2019,10 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can
     for (size_t i = 0; i < candidates->size; ++i) {
         cum_sum += candidates->data[i].p;

-        // Check if the running sum is greater than p or if we have kept at least min_keep tokens
-        if (cum_sum > p && i >= min_keep) {
-            last_idx = i;
+        // Check if the running sum is at least p or if we have kept at least min_keep tokens
+        // we set the last index to i+1 to indicate that the current iterate should be included in the set
+        if (cum_sum >= p && i + 1 >= min_keep) {
+            last_idx = i + 1;
             break;
         }
     }
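To see what the top-p change fixes, take sorted probabilities {0.5, 0.3, 0.2} with p = 0.7 and min_keep = 1, assuming last_idx becomes the truncated candidate count as before: the old test (cum_sum > p, last_idx = i) fires at i = 1 and keeps only one token with mass 0.5 < p, while the new test (cum_sum >= p, last_idx = i + 1) also stops at i = 1 but keeps two tokens with mass 0.8 >= p, so the kept set now actually covers the requested probability mass.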
@@ -2459,6 +2474,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<std::thread> workers;
     std::mutex mutex;

+    auto use_more_bits = [] (int i_layer, int num_layers) -> bool {
+        return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
+    };
+
     size_t idx = 0;
     for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
         llama_buffer read_data;
@@ -2513,15 +2532,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
             else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
             else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
-                    (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8 ||
-                    (i_attention_wv - n_attention_wv/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
+                    use_more_bits(i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
+            else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
+                    (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
             ++i_attention_wv;
         } else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
             if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
             else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
             else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
-                    (i_feed_forward_w2 < n_feed_forward_w2/8 || i_feed_forward_w2 >= 7*n_feed_forward_w2/8 ||
-                    (i_feed_forward_w2 - n_feed_forward_w2/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
+                    use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+            //else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < n_feed_forward_w2/8) new_type = GGML_TYPE_Q6_K;
             ++i_feed_forward_w2;
         } else if (tensor.name.find("attention.wo.weight") != std::string::npos) {
             if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
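For intuition on use_more_bits: with num_layers = 32 it returns true for layers 0..3 (the first eighth), 28..31 (the last eighth), and every third layer in between where (i_layer - 4) % 3 == 2, i.e. layers 6, 9, ..., 27; this is exactly the inline condition the two call sites used to spell out.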
@@ -2634,12 +2654,39 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 // interface implementation
 //

-struct llama_context * llama_init_from_file(
+struct llama_model * llama_load_model_from_file(
         const char * path_model,
         struct llama_context_params params) {
     ggml_time_init();

-    llama_context * ctx = new llama_context;
+    llama_model * model = new llama_model;
+
+    ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
+
+    if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
+                params.main_gpu, params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
+                params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
+        delete model;
+        fprintf(stderr, "%s: failed to load model\n", __func__);
+        return nullptr;
+    }
+
+    return model;
+}
+
+void llama_free_model(struct llama_model * model) {
+    delete model;
+}
+
+struct llama_context * llama_new_context_with_model(
+        struct llama_model * model,
+        struct llama_context_params params) {
+
+    if (!model) {
+        return nullptr;
+    }
+
+    llama_context * ctx = new llama_context(*model, model->vocab);

     if (params.seed < 0) {
         params.seed = time(NULL);
@@ -2667,24 +2714,16 @@ struct llama_context * llama_init_from_file(

     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;

-    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers, params.main_gpu,
-                          params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
-                          params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
-        fprintf(stderr, "%s: failed to load model\n", __func__);
-        llama_free(ctx);
-        return nullptr;
-    }
-
     // reserve memory for context buffers
     if (!params.vocab_only) {
-        if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
+        if (!kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
             fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
             llama_free(ctx);
             return nullptr;
         }

         {
-            const size_t memory_size = ggml_nbytes(ctx->model.kv_self.k) + ggml_nbytes(ctx->model.kv_self.v);
+            const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
             fprintf(stderr, "%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
         }

@@ -2737,7 +2776,7 @@ struct llama_context * llama_init_from_file(
         LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));

         LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0));
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv",   ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size, 0));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv",   ctx->kv_self.buf.addr, ctx->kv_self.buf.size, 0));

         LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0));
         LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0));
@@ -2748,7 +2787,23 @@ struct llama_context * llama_init_from_file(
     return ctx;
 }

+struct llama_context * llama_init_from_file(
+        const char * path_model,
+        struct llama_context_params params) {
+
+    struct llama_model * model = llama_load_model_from_file(path_model, params);
+    if (!model) {
+        return nullptr;
+    }
+    struct llama_context * ctx = llama_new_context_with_model(model, params);
+    ctx->model_owner = true;
+    return ctx;
+}
+
 void llama_free(struct llama_context * ctx) {
+    if (ctx->model_owner) {
+        delete &ctx->model;
+    }
     delete ctx;
 }

@@ -2765,11 +2820,9 @@ int llama_model_quantize(
     }
 }

-int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
+int llama_apply_lora_from_file_internal(const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads) {
     fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);

-    auto & model = ctx->model;
-
     const int64_t t_start_lora_us = ggml_time_us();

     auto fin = std::ifstream(path_lora, std::ios::binary);
@@ -3012,7 +3065,16 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *

 int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
     try {
-        return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
+        return llama_apply_lora_from_file_internal(ctx->model, path_lora, path_base_model, n_threads);
+    } catch (const std::exception & err) {
+        fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
+        return 1;
+    }
+}
+
+int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, const char * path_base_model, int n_threads) {
+    try {
+        return llama_apply_lora_from_file_internal(*model, path_lora, path_base_model, n_threads);
     } catch (const std::exception & err) {
         fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
         return 1;
@@ -3020,7 +3082,7 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
 }

 int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
-    return ctx->model.kv_self.n;
+    return ctx->kv_self.n;
 }

 #define LLAMA_MAX_RNG_STATE (64*1024)
@@ -3045,7 +3107,7 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
     const size_t s_embedding = ctx->embedding.size() * sizeof(float);
     const size_t s_kv_size   = sizeof(size_t);
     const size_t s_kv_ntok   = sizeof(int);
-    const size_t s_kv        = ctx->model.kv_self.buf.size;
+    const size_t s_kv        = ctx->kv_self.buf.size;

     const size_t s_total = (
         + s_rng_size
@@ -3111,7 +3173,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {

     // copy kv cache
     {
-        const auto & kv_self = ctx->model.kv_self;
+        const auto & kv_self = ctx->kv_self;
         const auto & hparams = ctx->model.hparams;
         const int n_layer = hparams.n_layer;
         const int n_embd  = hparams.n_embd;
@@ -3215,7 +3277,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {

     // set kv cache
     {
-        const auto & kv_self = ctx->model.kv_self;
+        const auto & kv_self = ctx->kv_self;
         const auto & hparams = ctx->model.hparams;
         const int n_layer = hparams.n_layer;
         const int n_embd  = hparams.n_embd;
@@ -3259,7 +3321,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
         ggml_free(cpy_ctx);
     }

-    ctx->model.kv_self.n = kv_ntok;
+    ctx->kv_self.n = kv_ntok;
 }

 const size_t nread = inp - src;
@@ -3506,6 +3568,6 @@ const char * llama_print_system_info(void) {
 }

 // For internal test use
-std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
+const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
     return ctx->model.tensors_by_name;
 }
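A minimal usage sketch of the split API introduced above (the model path is illustrative and error checks are omitted):

    struct llama_context_params params = llama_context_default_params();

    // load the weights once ...
    struct llama_model * model = llama_load_model_from_file("models/7B/ggml-model.bin", params);

    // ... then create a context that borrows them
    struct llama_context * ctx = llama_new_context_with_model(model, params);

    // ... evaluate and sample as before ...

    llama_free(ctx);         // model_owner is false here, so this leaves the weights alone
    llama_free_model(model); // the model is freed explicitly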
llama.h (50 lines changed)
@@ -26,6 +26,14 @@
 #    define LLAMA_API
 #endif

+#ifdef __GNUC__
+#    define DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
+#elif defined(_MSC_VER)
+#    define DEPRECATED(func, hint) __declspec(deprecated(hint)) func
+#else
+#    define DEPRECATED(func, hint) func
+#endif
+
 #define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt'
 #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
 #define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf'
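For illustration, wrapping a declaration in the new macro, say DEPRECATED(int foo(void), "use bar instead"), expands to int foo(void) __attribute__((deprecated("use bar instead"))) under GCC/Clang, to __declspec(deprecated("use bar instead")) int foo(void) under MSVC, and to the plain declaration elsewhere (foo and bar are placeholders).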
@@ -53,6 +61,7 @@ extern "C" {
     // TODO: show sample usage
     //

+    struct llama_model;
     struct llama_context;

     typedef int llama_token;
@@ -72,27 +81,26 @@ extern "C" {
     typedef void (*llama_progress_callback)(float progress, void *ctx);

     struct llama_context_params {
+        int seed;                              // RNG seed, -1 for random
         int n_ctx;                             // text context
         int n_batch;                           // prompt processing batch size
         int n_gpu_layers;                      // number of layers to store in VRAM
         int main_gpu;                          // the GPU that is used for scratch and small tensors
         float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
-        bool low_vram;                         // if true, reduce VRAM usage at the cost of performance
-        int seed;                              // RNG seed, -1 for random
+        // called with a progress value between 0 and 1, pass NULL to disable
+        llama_progress_callback progress_callback;
+        // context pointer passed to the progress callback
+        void * progress_callback_user_data;
+
+        // Keep the booleans together to avoid misalignment during copy-by-value.
+        bool low_vram;   // if true, reduce VRAM usage at the cost of performance
         bool f16_kv;     // use fp16 for KV cache
         bool logits_all; // the llama_eval() call computes all logits, not just the last one
         bool vocab_only; // only load the vocabulary, no weights
         bool use_mmap;   // use mmap if possible
         bool use_mlock;  // force system to keep model in RAM
         bool embedding;  // embedding mode only
-
-        // called with a progress value between 0 and 1, pass NULL to disable
-        llama_progress_callback progress_callback;
-        // context pointer passed to the progress callback
-        void * progress_callback_user_data;
     };

     // model file types
     enum llama_ftype {
         LLAMA_FTYPE_ALL_F32 = 0,
@@ -137,12 +145,23 @@ extern "C" {

     LLAMA_API int64_t llama_time_us();

+    LLAMA_API struct llama_model * llama_load_model_from_file(
+            const char * path_model,
+            struct llama_context_params params);
+
+    LLAMA_API void llama_free_model(struct llama_model * model);
+
+    LLAMA_API struct llama_context * llama_new_context_with_model(
+            struct llama_model * model,
+            struct llama_context_params params);
+
     // Various functions for loading a ggml llama model.
     // Allocate (almost) all memory needed for the model.
     // Return NULL on failure
-    LLAMA_API struct llama_context * llama_init_from_file(
+    LLAMA_API DEPRECATED(struct llama_context * llama_init_from_file(
             const char * path_model,
-            struct llama_context_params params);
+            struct llama_context_params params),
+            "please use llama_load_model_from_file combined with llama_new_context_with_model instead");

     // Frees all allocated memory
     LLAMA_API void llama_free(struct llama_context * ctx);
@@ -159,8 +178,15 @@ extern "C" {
     // The model needs to be reloaded before applying a new adapter, otherwise the adapter
     // will be applied on top of the previous one
     // Returns 0 on success
-    LLAMA_API int llama_apply_lora_from_file(
+    LLAMA_API DEPRECATED(int llama_apply_lora_from_file(
             struct llama_context * ctx,
+            const char * path_lora,
+            const char * path_base_model,
+            int n_threads),
+            "please use llama_model_apply_lora_from_file instead");
+
+    LLAMA_API int llama_model_apply_lora_from_file(
+            const struct llama_model * model,
             const char * path_lora,
             const char * path_base_model,
             int n_threads);
@@ -311,7 +337,7 @@ extern "C" {
 #include <string>
 struct ggml_tensor;

-std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
+const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);

 #endif
@@ -1,3 +1,4 @@
+#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows
 #include "ggml.h"

 #include <math.h>
@@ -5,6 +6,10 @@
 #include <stdlib.h>
 #include <assert.h>

+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
 #define MAX_NARGS 3

 #undef MIN
@@ -197,8 +202,23 @@ bool check_gradient(
         float max_error_abs,
         float max_error_rel) {

+    static int n_threads = -1;
+    if (n_threads < 0) {
+        n_threads = GGML_DEFAULT_N_THREADS;
+
+        const char *env = getenv("GGML_N_THREADS");
+        if (env) {
+            n_threads = atoi(env);
+        }
+
+        printf("GGML_N_THREADS = %d\n", n_threads);
+    }
+
     struct ggml_cgraph gf = ggml_build_forward (f);
+    gf.n_threads = n_threads;
+
     struct ggml_cgraph gb = ggml_build_backward(ctx0, &gf, false);
+    gb.n_threads = n_threads;

     ggml_graph_compute(ctx0, &gf);
     ggml_graph_reset  (&gf);
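The gradient-test change above reads an optional GGML_N_THREADS environment variable, so a run with 8 threads might look like GGML_N_THREADS=8 ./test-grad0 (the binary name and path depend on how the tests are built).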
@ -21,6 +21,7 @@
|
||||||
#define QK 32
|
#define QK 32
|
||||||
#define WARMUP 5
|
#define WARMUP 5
|
||||||
#define ITERATIONS 10
|
#define ITERATIONS 10
|
||||||
|
#define MAX_ITERATIONS 100000000
|
||||||
|
|
||||||
#define L1_SIZE 32*128
|
#define L1_SIZE 32*128
|
||||||
#define L2_SIZE 32*2048
|
#define L2_SIZE 32*2048
|
||||||
|
@ -36,9 +37,9 @@ struct quantize_perf_params {
|
||||||
bool op_dequantize_row_q = false;
|
bool op_dequantize_row_q = false;
|
||||||
bool op_quantize_row_q_dot = false;
|
bool op_quantize_row_q_dot = false;
|
||||||
bool op_vec_dot_q = false;
|
bool op_vec_dot_q = false;
|
||||||
|
int64_t iterations = ITERATIONS;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
#if defined(__x86_64__) || defined(__i386__)
|
#if defined(__x86_64__) || defined(__i386__)
|
||||||
|
|
||||||
#include <x86intrin.h>
|
#include <x86intrin.h>
|
||||||
|
@ -75,7 +76,7 @@ void * align_with_offset(void * ptr, int offset) {
|
||||||
return (char *) std::align(MAX_ALIGNMENT, MAX_ALIGNMENT, ptr, dummy_size) + offset;
|
return (char *) std::align(MAX_ALIGNMENT, MAX_ALIGNMENT, ptr, dummy_size) + offset;
|
||||||
}
|
}
|
||||||
|
|
||||||
void benchmark_function(size_t size, size_t q_size, std::function<size_t(void)> function) {
|
void benchmark_function(size_t size, size_t q_size, int64_t iterations, std::function<size_t(void)> function) {
|
||||||
int64_t min_time_us = INT64_MAX;
|
int64_t min_time_us = INT64_MAX;
|
||||||
int64_t total_time_us = 0;
|
int64_t total_time_us = 0;
|
||||||
int64_t min_time_cycles = INT64_MAX;
|
int64_t min_time_cycles = INT64_MAX;
|
||||||
|
@ -86,7 +87,7 @@ void benchmark_function(size_t size, size_t q_size, std::function<size_t(void)>
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
for (int i = 0; i < ITERATIONS; i++) {
|
for (int i = 0; i < iterations; i++) {
|
||||||
const int64_t start_time = ggml_time_us();
|
const int64_t start_time = ggml_time_us();
|
||||||
const int64_t start_cycles = cpu_cycles();
|
const int64_t start_cycles = cpu_cycles();
|
||||||
|
|
||||||
|
@ -102,9 +103,38 @@ void benchmark_function(size_t size, size_t q_size, std::function<size_t(void)>
|
||||||
}
|
}
|
||||||
|
|
||||||
printf(" min cycles/%d vals : %9.2f\n", QK, QK * min_time_cycles / (float) size);
|
printf(" min cycles/%d vals : %9.2f\n", QK, QK * min_time_cycles / (float) size);
|
||||||
printf(" avg cycles/%d vals : %9.2f\n", QK, QK * total_time_cycles / (float) (size * ITERATIONS));
|
printf(" avg cycles/%d vals : %9.2f\n", QK, QK * total_time_cycles / (float) (size * iterations));
|
||||||
printf(" float32 throughput : %9.2f GB/s\n", gigabytes_per_second(4 * size * ITERATIONS, total_time_us));
|
printf(" float32 throughput : %9.2f GB/s\n", gigabytes_per_second(4 * size * iterations, total_time_us));
|
||||||
printf(" quantized throughput : %9.2f GB/s\n", gigabytes_per_second(q_size * ITERATIONS, total_time_us));
|
printf(" quantized throughput : %9.2f GB/s\n", gigabytes_per_second(q_size * iterations, total_time_us));
|
||||||
|
}
|
||||||
|
|
||||||
|
void usage(char * argv[]) {
|
||||||
|
printf("Benchmark quantization specific functions on synthetic data\n");
|
||||||
|
printf("\n");
|
||||||
|
printf("usage: %s [options]\n", argv[0]);
|
||||||
|
printf("\n");
|
||||||
|
printf("options: (default)\n");
|
||||||
|
printf(" -h, --help show this help message and exit\n");
|
||||||
|
printf(" --size SIZE set test size, divisible by 32 (L1_SIZE:%d)\n", L1_SIZE);
|
||||||
|
printf(" -3 use size as L1, L2, L3 sizes (L1:%d L2:%d L3:%d)\n", L1_SIZE, L2_SIZE, L3_SIZE);
|
||||||
|
printf(" -4 use size as L1, L2, L3, MEM sizes (L1:%d L2:%d L3:%d MEM:%d)\n", L1_SIZE, L2_SIZE, L3_SIZE, MEM_SIZE);
|
||||||
|
printf(" --op OP set test opration as quantize_row_q_reference, quantize_row_q, dequantize_row_q,\n");
|
||||||
|
printf(" quantize_row_q_dot, vec_dot_q (all)\n");
|
||||||
|
printf(" --type TYPE set test type as");
|
||||||
|
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
|
||||||
|
ggml_type type = (ggml_type) i;
|
||||||
|
quantize_fns_t qfns = ggml_internal_get_quantize_fn(type);
|
||||||
|
if (ggml_type_name(type) != NULL) {
|
||||||
|
if (qfns.quantize_row_q && qfns.dequantize_row_q) {
|
||||||
|
printf(" %s", ggml_type_name(type));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
printf(" (all)\n");
|
||||||
|
printf(" --alignment-offset OFFSET\n");
|
||||||
|
printf(" set alignment offset as OFFSET (0)\n");
|
||||||
|
printf(" -i NUM, --iterations NUM\n");
|
||||||
|
printf(" set test iteration number (%d)\n", ITERATIONS);
|
||||||
}
|
}
|
||||||
|
|
||||||
int main(int argc, char * argv[]) {
|
int main(int argc, char * argv[]) {
|
||||||
|
@@ -178,6 +208,21 @@ int main(int argc, char * argv[]) {
                 break;
             }
             params.alignment_offset = alignment;
+        } else if ((arg == "-i") || (arg == "--iterations")) {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            int number = std::stoi(argv[i]);
+            if (number < 0 || number > MAX_ITERATIONS) {
+                fprintf(stderr, "error: iterations must be less than %d\n", MAX_ITERATIONS);
+                invalid_param = true;
+                break;
+            }
+            params.iterations = number;
+        } else if ((arg == "-h") || (arg == "--help")) {
+            usage(argv);
+            return 1;
         } else {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
             return 1;
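
With the new options wired up, a run might look like this (the binary name and values are illustrative, not part of the diff):

    ./test-quantize-perf --type q4_0 --op vec_dot_q --iterations 100000

One caveat worth knowing: std::stoi throws std::invalid_argument on non-numeric input, so the guard above only covers the 0..MAX_ITERATIONS range, not malformed arguments.
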
@@ -213,6 +258,8 @@ int main(int argc, char * argv[]) {
     generate_data(0, largest, test_data1);
     generate_data(1, largest, test_data2);
 
+    int64_t iterations = params.iterations;
+
 
     // Initialize GGML, ensures float conversion tables are initialized
     struct ggml_init_params ggml_params = {
@@ -225,7 +272,7 @@ int main(int argc, char * argv[]) {
     for (int i = 0; i < GGML_TYPE_COUNT; i++) {
         ggml_type type = (ggml_type) i;
         quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
-        if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), ggml_type_name(type)) == params.include_types.end()) {
+        if (!params.include_types.empty() && ggml_type_name(type) && std::find(params.include_types.begin(), params.include_types.end(), ggml_type_name(type)) == params.include_types.end()) {
             continue;
         }
 
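
The extra ggml_type_name(type) check exists because the loop visits every raw enum value below GGML_TYPE_COUNT, including gaps for which ggml_type_name() returns NULL, and letting that NULL reach the std::string comparisons inside std::find is undefined behavior. A self-contained illustration of the guarded pattern (the function name is local to this example):

    #include <algorithm>
    #include <string>
    #include <vector>

    // True when the filter list is non-empty and `name` is absent from it.
    // The `name &&` guard mirrors the diff: a NULL name must never be
    // compared against the std::string entries.
    static bool filtered_out(const std::vector<std::string> & filter, const char * name) {
        return !filter.empty() && name &&
               std::find(filter.begin(), filter.end(), name) == filter.end();
    }
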
@@ -241,7 +288,7 @@ int main(int argc, char * argv[]) {
                     return test_q1[0];
                 };
                 size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
-                benchmark_function(size, quantized_size, quantize_fn);
+                benchmark_function(size, quantized_size, iterations, quantize_fn);
             }
             printf("\n");
         }
@@ -255,7 +302,7 @@ int main(int argc, char * argv[]) {
                     return test_q1[0];
                 };
                 size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
-                benchmark_function(size, quantized_size, quantize_fn);
+                benchmark_function(size, quantized_size, iterations, quantize_fn);
             }
             printf("\n");
         }
@@ -270,7 +317,7 @@ int main(int argc, char * argv[]) {
                     return test_out[0];
                 };
                 size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
-                benchmark_function(size, quantized_size, quantize_fn);
+                benchmark_function(size, quantized_size, iterations, quantize_fn);
             }
             printf("\n");
         }
@@ -284,7 +331,7 @@ int main(int argc, char * argv[]) {
                     return test_q1[0];
                 };
                 size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
-                benchmark_function(size, quantized_size, quantize_fn);
+                benchmark_function(size, quantized_size, iterations, quantize_fn);
             }
             printf("\n");
         }
@@ -301,7 +348,7 @@ int main(int argc, char * argv[]) {
                     return result;
                 };
                 size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
-                benchmark_function(size, quantized_size, quantize_fn);
+                benchmark_function(size, quantized_size, iterations, quantize_fn);
             }
             printf("\n");
         }
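
The five hunks above are mechanical: every benchmark call site gains the iterations argument. Condensed, each op follows the same shape (a sketch assembled from the surrounding context lines; each op substitutes its own qfns entry and sink value, as the differing return test_q1[0] / test_out[0] / result lines show):

    auto quantize_fn = [&](void) -> size_t {
        qfns.quantize_row_q(test_data1, test_q1, size);   // the op under test
        return test_q1[0];                                // sink value so the call is not elided
    };
    size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
    benchmark_function(size, quantized_size, iterations, quantize_fn);
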
@@ -181,6 +181,7 @@ int main(void) {
 
     test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f}, 0);
     test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f}, 0.7f);
+    test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f}, 0.8f);
     test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1);
 
     test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f}, 0.25f);
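
The new expectation follows directly from the top-p (nucleus) rule: keep the smallest set of highest-probability candidates whose cumulative mass reaches p. For {0.4, 0.3, 0.2, 0.1} with p = 0.8, two candidates give 0.7 < 0.8 and three give 0.9 >= 0.8, hence {0.4f, 0.3f, 0.2f}. A minimal sketch of that selection rule (not the library's implementation):

    #include <cstddef>
    #include <vector>

    // Keep the smallest prefix of descending probabilities whose sum reaches p.
    static std::vector<float> top_p_keep(std::vector<float> sorted_probs, float p) {
        float cum = 0.0f;
        size_t keep = sorted_probs.size();
        for (size_t i = 0; i < sorted_probs.size(); i++) {
            cum += sorted_probs[i];
            if (cum >= p) {
                keep = i + 1;
                break;
            }
        }
        sorted_probs.resize(keep);   // {0.4, 0.3, 0.2, 0.1}, p = 0.8 -> keep 3
        return sorted_probs;
    }
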
@@ -28,6 +28,7 @@ int main(int argc, char **argv) {
 
     fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
 
+    llama_model * model;
     llama_context * ctx;
 
     // load the vocab
@@ -36,10 +37,18 @@ int main(int argc, char **argv) {
 
         lparams.vocab_only = true;
 
-        ctx = llama_init_from_file(fname.c_str(), lparams);
+        model = llama_load_model_from_file(fname.c_str(), lparams);
+
+        if (model == NULL) {
+            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
+            return 1;
+        }
+
+        ctx = llama_new_context_with_model(model, lparams);
 
         if (ctx == NULL) {
             fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
+            llama_free_model(model);
             return 1;
         }
     }
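
This hunk is the migration from the removed single-call llama_init_from_file to the split API: load the model once with llama_load_model_from_file, create a context over it with llama_new_context_with_model, and release the two handles separately. Stripped of the vocab-specific error messages, the pattern the test adopts looks like this (a condensed sketch, not a verbatim excerpt):

    #include "llama.h"

    static int load_vocab_only(const char * fname) {
        llama_context_params lparams = llama_context_default_params();
        lparams.vocab_only = true;               // read the vocab, skip the weights

        llama_model * model = llama_load_model_from_file(fname, lparams);
        if (model == NULL) {
            return 1;                            // nothing to free yet
        }

        llama_context * ctx = llama_new_context_with_model(model, lparams);
        if (ctx == NULL) {
            llama_free_model(model);             // the model now outlives the context
            return 1;
        }

        // ... use ctx ...

        llama_free_model(model);                 // freed in the order the diff uses
        llama_free(ctx);
        return 0;
    }
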
@@ -48,6 +57,8 @@ int main(int argc, char **argv) {
 
     if (n_vocab != 32000) {
         fprintf(stderr, "%s : expected 32000 tokens, got %d\n", __func__, n_vocab);
+        llama_free_model(model);
+        llama_free(ctx);
         return 2;
     }
 
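
One observation on the cleanup changes here and below: every early-return path now has to free both handles, which is easy to miss when new exit paths are added. If the test ever wanted to centralize this, std::unique_ptr with custom deleters is the usual alternative; this is only a sketch of that option, not something the diff does:

    #include <memory>
    #include "llama.h"

    // RAII wrappers: declaring the context after the model means it is
    // destroyed first, so every return path frees both handles exactly once.
    using model_ptr = std::unique_ptr<llama_model, decltype(&llama_free_model)>;
    using ctx_ptr   = std::unique_ptr<llama_context, decltype(&llama_free)>;

    // model_ptr model(llama_load_model_from_file(fname, lparams), llama_free_model);
    // ctx_ptr   ctx(llama_new_context_with_model(model.get(), lparams), llama_free);
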
@@ -77,10 +88,13 @@ int main(int argc, char **argv) {
             }
             fprintf(stderr, "\n");
 
+            llama_free_model(model);
+            llama_free(ctx);
             return 3;
         }
     }
 
+    llama_free_model(model);
     llama_free(ctx);
 
     return 0;