Merge branch 'ggerganov:master' into master

This commit is contained in:
ymcki 2024-12-28 22:10:52 +08:00 committed by GitHub
commit f9a1cdb3a7
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
34 changed files with 1135 additions and 768 deletions

View file

@ -684,6 +684,9 @@ class Model:
if chkhsh == "ad851be1dba641f2e3711822f816db2c265f788b37c63b4e1aeacb9ee92de8eb":
# ref: https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct
res = "gigachat"
if chkhsh == "d4c8f286ea6b520b3d495c4455483cfa2302c0cfcd4be05d781b6a8a0a7cdaf1":
# ref: https://huggingface.co/Infinigence/Megrez-3B-Instruct
res = "megrez"
if res is None:
logger.warning("\n")

View file

@ -106,6 +106,7 @@ models = [
{"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", },
{"name": "roberta-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sentence-transformers/stsb-roberta-base"},
{"name": "gigachat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct"},
{"name": "megrez", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Infinigence/Megrez-3B-Instruct"},
]

View file

@ -15,7 +15,7 @@ static void run(
for (size_t il = 0; il < v_input.size(); ++il) {
// prepare output vector
struct ggml_tensor * ctrl_out = v_output[il];
ggml_format_name(ctrl_out, "direction.%ld", il+1);
ggml_format_name(ctrl_out, "direction.%zu", il+1);
// calculate mean vector
struct ggml_tensor * t_layer = v_input[il];

View file

@ -302,7 +302,7 @@ static void run_pca(
// prepare output vector
struct ggml_tensor * ctrl_out = v_output[il];
ggml_format_name(ctrl_out, "direction.%ld", il+1);
ggml_format_name(ctrl_out, "direction.%zu", il+1);
// run power_iteration
params.i_layer = il;

View file

@ -265,8 +265,8 @@ struct lora_merge_ctx {
fout.write((const char *)data.data(), data.size());
}
printf("%s : merged %ld tensors with lora adapters\n", __func__, n_merged);
printf("%s : wrote %ld tensors to output file\n", __func__, trans.size());
printf("%s : merged %zu tensors with lora adapters\n", __func__, n_merged);
printf("%s : wrote %zu tensors to output file\n", __func__, trans.size());
}
void copy_tensor(struct ggml_tensor * base) {
@ -352,7 +352,7 @@ struct lora_merge_ctx {
const float scale = alpha ? adapters[i]->scale * alpha / rank : adapters[i]->scale;
delta = ggml_scale(ctx0, delta, scale);
cur = ggml_add(ctx0, delta, cur);
printf("%s : + merging from adapter[%ld] type=%s\n", __func__, i, ggml_type_name(inp_a[i]->type));
printf("%s : + merging from adapter[%zu] type=%s\n", __func__, i, ggml_type_name(inp_a[i]->type));
printf("%s : input_scale=%f calculated_scale=%f rank=%d\n", __func__, adapters[i]->scale, scale, (int) inp_b[i]->ne[0]);
}
cur = ggml_cast(ctx0, cur, out->type);

View file

@ -12,6 +12,10 @@
#include "ggml-vulkan.h"
#endif
#ifdef GGML_USE_SYCL
#include "ggml-sycl.h"
#endif
#include "ggml-rpc.h"
#ifdef _WIN32
# include <windows.h>
@ -91,6 +95,12 @@ static ggml_backend_t create_backend() {
if (!backend) {
fprintf(stderr, "%s: ggml_backend_vulkan_init() failed\n", __func__);
}
#elif GGML_USE_SYCL
fprintf(stderr, "%s: using SYCL backend\n", __func__);
backend = ggml_backend_sycl_init(0); // init device 0
if (!backend) {
fprintf(stderr, "%s: ggml_backend_sycl_init() failed\n", __func__);
}
#endif
// if there aren't GPU Backends fallback to CPU backend
@ -106,6 +116,8 @@ static void get_backend_memory(size_t * free_mem, size_t * total_mem) {
ggml_backend_cuda_get_device_memory(0, free_mem, total_mem);
#elif GGML_USE_VULKAN
ggml_backend_vk_get_device_memory(0, free_mem, total_mem);
#elif GGML_USE_SYCL
ggml_backend_sycl_get_device_memory(0, free_mem, total_mem);
#else
#ifdef _WIN32
MEMORYSTATUSEX status;

View file

@ -19,6 +19,8 @@ Options:
Context size (default: 2048)
-n, --ngl <value>
Number of GPU layers (default: 0)
--temp <value>
Temperature (default: 0.8)
-v, --verbose, --log-verbose
Set verbosity level to infinity (i.e. log all messages, useful for debugging)
-h, --help

View file

@ -55,29 +55,51 @@ static int printe(const char * fmt, ...) {
class Opt {
public:
int init(int argc, const char ** argv) {
ctx_params = llama_context_default_params();
model_params = llama_model_default_params();
context_size_default = ctx_params.n_batch;
ngl_default = model_params.n_gpu_layers;
common_params_sampling sampling;
temperature_default = sampling.temp;
if (argc < 2) {
printe("Error: No arguments provided.\n");
print_help();
return 1;
}
// Parse arguments
if (parse(argc, argv)) {
printe("Error: Failed to parse arguments.\n");
help();
print_help();
return 1;
}
// If help is requested, show help and exit
if (help_) {
help();
if (help) {
print_help();
return 2;
}
ctx_params.n_batch = context_size >= 0 ? context_size : context_size_default;
model_params.n_gpu_layers = ngl >= 0 ? ngl : ngl_default;
temperature = temperature >= 0 ? temperature : temperature_default;
return 0; // Success
}
llama_context_params ctx_params;
llama_model_params model_params;
std::string model_;
std::string user_;
int context_size_ = -1, ngl_ = -1;
bool verbose_ = false;
std::string user;
int context_size = -1, ngl = -1;
float temperature = -1;
bool verbose = false;
private:
bool help_ = false;
int context_size_default = -1, ngl_default = -1;
float temperature_default = -1;
bool help = false;
bool parse_flag(const char ** argv, int i, const char * short_opt, const char * long_opt) {
return strcmp(argv[i], short_opt) == 0 || strcmp(argv[i], long_opt) == 0;
@ -89,6 +111,17 @@ class Opt {
}
option_value = std::atoi(argv[++i]);
return 0;
}
int handle_option_with_value(int argc, const char ** argv, int & i, float & option_value) {
if (i + 1 >= argc) {
return 1;
}
option_value = std::atof(argv[++i]);
return 0;
}
@ -96,18 +129,22 @@ class Opt {
bool options_parsing = true;
for (int i = 1, positional_args_i = 0; i < argc; ++i) {
if (options_parsing && (strcmp(argv[i], "-c") == 0 || strcmp(argv[i], "--context-size") == 0)) {
if (handle_option_with_value(argc, argv, i, context_size_) == 1) {
if (handle_option_with_value(argc, argv, i, context_size) == 1) {
return 1;
}
} else if (options_parsing && (strcmp(argv[i], "-n") == 0 || strcmp(argv[i], "--ngl") == 0)) {
if (handle_option_with_value(argc, argv, i, ngl_) == 1) {
if (handle_option_with_value(argc, argv, i, ngl) == 1) {
return 1;
}
} else if (options_parsing && strcmp(argv[i], "--temp") == 0) {
if (handle_option_with_value(argc, argv, i, temperature) == 1) {
return 1;
}
} else if (options_parsing &&
(parse_flag(argv, i, "-v", "--verbose") || parse_flag(argv, i, "-v", "--log-verbose"))) {
verbose_ = true;
verbose = true;
} else if (options_parsing && parse_flag(argv, i, "-h", "--help")) {
help_ = true;
help = true;
return 0;
} else if (options_parsing && strcmp(argv[i], "--") == 0) {
options_parsing = false;
@ -120,16 +157,16 @@ class Opt {
model_ = argv[i];
} else if (positional_args_i == 1) {
++positional_args_i;
user_ = argv[i];
user = argv[i];
} else {
user_ += " " + std::string(argv[i]);
user += " " + std::string(argv[i]);
}
}
return 0;
}
void help() const {
void print_help() const {
printf(
"Description:\n"
" Runs a llm\n"
@ -142,6 +179,8 @@ class Opt {
" Context size (default: %d)\n"
" -n, --ngl <value>\n"
" Number of GPU layers (default: %d)\n"
" --temp <value>\n"
" Temperature (default: %.1f)\n"
" -v, --verbose, --log-verbose\n"
" Set verbosity level to infinity (i.e. log all messages, useful for debugging)\n"
" -h, --help\n"
@ -170,7 +209,7 @@ class Opt {
" llama-run file://some-file3.gguf\n"
" llama-run --ngl 999 some-file4.gguf\n"
" llama-run --ngl 999 some-file5.gguf Hello World\n",
llama_context_default_params().n_batch, llama_model_default_params().n_gpu_layers);
context_size_default, ngl_default, temperature_default);
}
};
@ -495,12 +534,12 @@ class LlamaData {
return 1;
}
context = initialize_context(model, opt.context_size_);
context = initialize_context(model, opt);
if (!context) {
return 1;
}
sampler = initialize_sampler();
sampler = initialize_sampler(opt);
return 0;
}
@ -619,14 +658,12 @@ class LlamaData {
// Initializes the model and returns a unique pointer to it
llama_model_ptr initialize_model(Opt & opt) {
ggml_backend_load_all();
llama_model_params model_params = llama_model_default_params();
model_params.n_gpu_layers = opt.ngl_ >= 0 ? opt.ngl_ : model_params.n_gpu_layers;
resolve_model(opt.model_);
printe(
"\r%*s"
"\rLoading model",
get_terminal_width(), " ");
llama_model_ptr model(llama_load_model_from_file(opt.model_.c_str(), model_params));
llama_model_ptr model(llama_load_model_from_file(opt.model_.c_str(), opt.model_params));
if (!model) {
printe("%s: error: unable to load model from file: %s\n", __func__, opt.model_.c_str());
}
@ -636,10 +673,8 @@ class LlamaData {
}
// Initializes the context with the specified parameters
llama_context_ptr initialize_context(const llama_model_ptr & model, const int n_ctx) {
llama_context_params ctx_params = llama_context_default_params();
ctx_params.n_ctx = ctx_params.n_batch = n_ctx >= 0 ? n_ctx : ctx_params.n_batch;
llama_context_ptr context(llama_new_context_with_model(model.get(), ctx_params));
llama_context_ptr initialize_context(const llama_model_ptr & model, const Opt & opt) {
llama_context_ptr context(llama_new_context_with_model(model.get(), opt.ctx_params));
if (!context) {
printe("%s: error: failed to create the llama_context\n", __func__);
}
@ -648,10 +683,10 @@ class LlamaData {
}
// Initializes and configures the sampler
llama_sampler_ptr initialize_sampler() {
llama_sampler_ptr initialize_sampler(const Opt & opt) {
llama_sampler_ptr sampler(llama_sampler_chain_init(llama_sampler_chain_default_params()));
llama_sampler_chain_add(sampler.get(), llama_sampler_init_min_p(0.05f, 1));
llama_sampler_chain_add(sampler.get(), llama_sampler_init_temp(0.8f));
llama_sampler_chain_add(sampler.get(), llama_sampler_init_temp(opt.temperature));
llama_sampler_chain_add(sampler.get(), llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
return sampler;
@ -798,9 +833,9 @@ static int apply_chat_template_with_error_handling(LlamaData & llama_data, const
}
// Helper function to handle user input
static int handle_user_input(std::string & user_input, const std::string & user_) {
if (!user_.empty()) {
user_input = user_;
static int handle_user_input(std::string & user_input, const std::string & user) {
if (!user.empty()) {
user_input = user;
return 0; // No need for interactive input
}
@ -832,17 +867,17 @@ static bool is_stdout_a_terminal() {
}
// Function to tokenize the prompt
static int chat_loop(LlamaData & llama_data, const std::string & user_) {
static int chat_loop(LlamaData & llama_data, const std::string & user) {
int prev_len = 0;
llama_data.fmtted.resize(llama_n_ctx(llama_data.context.get()));
static const bool stdout_a_terminal = is_stdout_a_terminal();
while (true) {
// Get user input
std::string user_input;
while (handle_user_input(user_input, user_)) {
while (handle_user_input(user_input, user)) {
}
add_message("user", user_.empty() ? user_input : user_, llama_data);
add_message("user", user.empty() ? user_input : user, llama_data);
int new_len;
if (apply_chat_template_with_error_handling(llama_data, true, new_len) < 0) {
return 1;
@ -854,7 +889,7 @@ static int chat_loop(LlamaData & llama_data, const std::string & user_) {
return 1;
}
if (!user_.empty()) {
if (!user.empty()) {
break;
}
@ -869,7 +904,7 @@ static int chat_loop(LlamaData & llama_data, const std::string & user_) {
static void log_callback(const enum ggml_log_level level, const char * text, void * p) {
const Opt * opt = static_cast<Opt *>(p);
if (opt->verbose_ || level == GGML_LOG_LEVEL_ERROR) {
if (opt->verbose || level == GGML_LOG_LEVEL_ERROR) {
printe("%s", text);
}
}
@ -890,11 +925,11 @@ int main(int argc, const char ** argv) {
}
if (!is_stdin_a_terminal()) {
if (!opt.user_.empty()) {
opt.user_ += "\n\n";
if (!opt.user.empty()) {
opt.user += "\n\n";
}
opt.user_ += read_pipe_data();
opt.user += read_pipe_data();
}
llama_log_set(log_callback, &opt);
@ -903,7 +938,7 @@ int main(int argc, const char ** argv) {
return 1;
}
if (chat_loop(llama_data, opt.user_)) {
if (chat_loop(llama_data, opt.user)) {
return 1;
}

View file

@ -34,6 +34,7 @@ endforeach()
add_executable(${TARGET} ${TARGET_SRCS})
install(TARGETS ${TARGET} RUNTIME)
target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})
target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})
if (LLAMA_SERVER_SSL)

View file

@ -450,6 +450,8 @@ These words will not be included in the completion, so make sure to add them to
`post_sampling_probs`: Returns the probabilities of top `n_probs` tokens after applying sampling chain.
`response_fields`: A list of response fields, for example: `"response_fields": ["content", "generation_settings/n_predict"]`. If the specified field is missing, it will simply be omitted from the response without triggering an error.
**Response format**
- Note: In streaming mode (`stream`), only `content`, `tokens` and `stop` will be returned until end of completion. Responses are sent using the [Server-sent events](https://html.spec.whatwg.org/multipage/server-sent-events.html) standard. Note: the browser's `EventSource` interface cannot be used due to its lack of `POST` request support.
@ -724,7 +726,8 @@ This endpoint is public (no API key check). By default, it is read-only. To make
},
"total_slots": 1,
"model_path": "../models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf",
"chat_template": "..."
"chat_template": "...",
"build_info": "b(build number)-(build commit hash)"
}
```

View file

@ -92,6 +92,7 @@ struct slot_params {
int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
std::vector<std::string> antiprompt;
std::vector<std::string> response_fields;
bool timings_per_token = false;
bool post_sampling_probs = false;
bool ignore_eos = false;
@ -209,6 +210,7 @@ struct server_task {
params.n_discard = json_value(data, "n_discard", defaults.n_discard);
//params.t_max_prompt_ms = json_value(data, "t_max_prompt_ms", defaults.t_max_prompt_ms); // TODO: implement
params.t_max_predict_ms = json_value(data, "t_max_predict_ms", defaults.t_max_predict_ms);
params.response_fields = json_value(data, "response_fields", std::vector<std::string>());
params.sampling.top_k = json_value(data, "top_k", defaults.sampling.top_k);
params.sampling.top_p = json_value(data, "top_p", defaults.sampling.top_p);
@ -522,6 +524,7 @@ struct server_task_result_cmpl_final : server_task_result {
bool post_sampling_probs;
std::vector<completion_token_output> probs_output;
std::vector<std::string> response_fields;
slot_params generation_params;
@ -568,7 +571,7 @@ struct server_task_result_cmpl_final : server_task_result {
if (!stream && !probs_output.empty()) {
res["completion_probabilities"] = completion_token_output::probs_vector_to_json(probs_output, post_sampling_probs);
}
return res;
return response_fields.empty() ? res : json_get_nested_values(response_fields, res);
}
json to_json_oaicompat_chat() {
@ -598,6 +601,7 @@ struct server_task_result_cmpl_final : server_task_result {
{"choices", json::array({choice})},
{"created", t},
{"model", oaicompat_model},
{"system_fingerprint", build_info},
{"object", "chat.completion"},
{"usage", json {
{"completion_tokens", n_decoded},
@ -636,6 +640,7 @@ struct server_task_result_cmpl_final : server_task_result {
{"created", t},
{"id", oaicompat_cmpl_id},
{"model", oaicompat_model},
{"system_fingerprint", build_info},
{"object", "chat.completion.chunk"},
{"usage", json {
{"completion_tokens", n_decoded},
@ -765,6 +770,7 @@ struct server_task_result_cmpl_partial : server_task_result {
{"created", t},
{"id", oaicompat_cmpl_id},
{"model", oaicompat_model},
{"system_fingerprint", build_info},
{"object", "chat.completion.chunk"}
};
@ -2063,6 +2069,7 @@ struct server_context {
res->tokens = slot.generated_tokens;
res->timings = slot.get_timings();
res->prompt = common_detokenize(ctx, slot.prompt_tokens, true);
res->response_fields = slot.params.response_fields;
res->truncated = slot.truncated;
res->n_decoded = slot.n_decoded;
@ -3476,6 +3483,7 @@ int main(int argc, char ** argv) {
{ "total_slots", ctx_server.params_base.n_parallel },
{ "model_path", ctx_server.params_base.model },
{ "chat_template", llama_get_chat_template(ctx_server.model) },
{ "build_info", build_info },
};
res_ok(res, data);
@ -3697,7 +3705,7 @@ int main(int argc, char ** argv) {
{"object", "list"},
{"data", {
{
{"id", params.model_alias},
{"id", params.model_alias.empty() ? params.model : params.model_alias},
{"object", "model"},
{"created", std::time(0)},
{"owned_by", "llamacpp"},
@ -3782,6 +3790,17 @@ int main(int argc, char ** argv) {
return;
}
bool use_base64 = false;
if (body.count("encoding_format") != 0) {
const std::string& format = body.at("encoding_format");
if (format == "base64") {
use_base64 = true;
} else if (format != "float") {
res_error(res, format_error_response("The format to return the embeddings in. Can be either float or base64", ERROR_TYPE_INVALID_REQUEST));
return;
}
}
std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.ctx, prompt, true, true);
for (const auto & tokens : tokenized_prompts) {
// this check is necessary for models that do not add BOS token to the input
@ -3833,7 +3852,7 @@ int main(int argc, char ** argv) {
}
// write JSON response
json root = oaicompat ? format_embeddings_response_oaicompat(body, responses) : json(responses);
json root = oaicompat ? format_embeddings_response_oaicompat(body, responses, use_base64) : json(responses);
res_ok(res, root);
};

View file

@ -31,6 +31,7 @@ def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_conte
})
assert res.status_code == 200
assert "cmpl" in res.body["id"] # make sure the completion id has the expected format
assert res.body["system_fingerprint"].startswith("b")
assert res.body["model"] == model if model is not None else server.model_alias
assert res.body["usage"]["prompt_tokens"] == n_prompt
assert res.body["usage"]["completion_tokens"] == n_predicted
@ -63,6 +64,7 @@ def test_chat_completion_stream(system_prompt, user_prompt, max_tokens, re_conte
last_cmpl_id = None
for data in res:
choice = data["choices"][0]
assert data["system_fingerprint"].startswith("b")
assert "gpt-3.5" in data["model"] # DEFAULT_OAICOMPAT_MODEL, maybe changed in the future
if last_cmpl_id is None:
last_cmpl_id = data["id"]
@ -92,6 +94,7 @@ def test_chat_completion_with_openai_library():
seed=42,
temperature=0.8,
)
assert res.system_fingerprint is not None and res.system_fingerprint.startswith("b")
assert res.choices[0].finish_reason == "length"
assert res.choices[0].message.content is not None
assert match_regex("(Suddenly)+", res.choices[0].message.content)

View file

@ -95,7 +95,7 @@ def test_consistent_result_same_seed(n_slots: int):
res = server.make_request("POST", "/completion", data={
"prompt": "I believe the meaning of life is",
"seed": 42,
"temperature": 1.0,
"temperature": 0.0,
"cache_prompt": False, # TODO: remove this once test_cache_vs_nocache_prompt is fixed
})
if last_res is not None:
@ -120,9 +120,10 @@ def test_different_result_different_seed(n_slots: int):
assert res.body["content"] != last_res.body["content"]
last_res = res
# TODO figure why it don't work with temperature = 1
# @pytest.mark.parametrize("temperature", [0.0, 1.0])
@pytest.mark.parametrize("n_batch", [16, 32])
@pytest.mark.parametrize("temperature", [0.0, 1.0])
@pytest.mark.parametrize("temperature", [0.0])
def test_consistent_result_different_batch_size(n_batch: int, temperature: float):
global server
server.n_batch = n_batch
@ -257,6 +258,40 @@ def test_completion_parallel_slots(n_slots: int, n_requests: int):
# assert match_regex(re_content, res.body["content"])
@pytest.mark.parametrize(
"prompt,n_predict,response_fields",
[
("I believe the meaning of life is", 8, []),
("I believe the meaning of life is", 32, ["content", "generation_settings/n_predict", "prompt"]),
],
)
def test_completion_response_fields(
prompt: str, n_predict: int, response_fields: list[str]
):
global server
server.start()
res = server.make_request(
"POST",
"/completion",
data={
"n_predict": n_predict,
"prompt": prompt,
"response_fields": response_fields,
},
)
assert res.status_code == 200
assert "content" in res.body
assert len(res.body["content"])
if len(response_fields):
assert res.body["generation_settings/n_predict"] == n_predict
assert res.body["prompt"] == "<s> " + prompt
assert isinstance(res.body["content"], str)
assert len(res.body) == len(response_fields)
else:
assert len(res.body)
assert "generation_settings" in res.body
def test_n_probs():
global server
server.start()

View file

@ -1,3 +1,5 @@
import base64
import struct
import pytest
from openai import OpenAI
from utils import *
@ -194,3 +196,42 @@ def test_embedding_usage_multiple():
assert res.status_code == 200
assert res.body['usage']['prompt_tokens'] == res.body['usage']['total_tokens']
assert res.body['usage']['prompt_tokens'] == 2 * 9
def test_embedding_openai_library_base64():
server.start()
test_input = "Test base64 embedding output"
# get embedding in default format
res = server.make_request("POST", "/v1/embeddings", data={
"input": test_input
})
assert res.status_code == 200
vec0 = res.body["data"][0]["embedding"]
# get embedding in base64 format
res = server.make_request("POST", "/v1/embeddings", data={
"input": test_input,
"encoding_format": "base64"
})
assert res.status_code == 200
assert "data" in res.body
assert len(res.body["data"]) == 1
embedding_data = res.body["data"][0]
assert "embedding" in embedding_data
assert isinstance(embedding_data["embedding"], str)
# Verify embedding is valid base64
decoded = base64.b64decode(embedding_data["embedding"])
# Verify decoded data can be converted back to float array
float_count = len(decoded) // 4 # 4 bytes per float
floats = struct.unpack(f'{float_count}f', decoded)
assert len(floats) > 0
assert all(isinstance(x, float) for x in floats)
assert len(floats) == len(vec0)
# make sure the decoded data is the same as the original
for x, y in zip(floats, vec0):
assert abs(x - y) < EPSILON

View file

@ -3,6 +3,7 @@
#include "common.h"
#include "log.h"
#include "llama.h"
#include "common/base64.hpp"
#ifndef NDEBUG
// crash the server in debug mode, otherwise send an http 500 error
@ -56,6 +57,8 @@ static T json_value(const json & body, const std::string & key, const T & defaul
}
}
const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT);
//
// tokenizer and input processing utils
//
@ -88,6 +91,28 @@ static bool json_is_array_of_mixed_numbers_strings(const json & data) {
return false;
}
// get value by path(key1 / key2)
static json json_get_nested_values(const std::vector<std::string> & paths, const json & js) {
json result = json::object();
for (const std::string & path : paths) {
json current = js;
const auto keys = string_split<std::string>(path, /*separator*/ '/');
bool valid_path = true;
for (const std::string & k : keys) {
if (valid_path && current.is_object() && current.contains(k)) {
current = current[k];
} else {
valid_path = false;
}
}
if (valid_path) {
result[path] = current;
}
}
return result;
}
/**
* this handles 2 cases:
* - only string, example: "string"
@ -589,16 +614,31 @@ static json oaicompat_completion_params_parse(
return llama_params;
}
static json format_embeddings_response_oaicompat(const json & request, const json & embeddings) {
static json format_embeddings_response_oaicompat(const json & request, const json & embeddings, bool use_base64 = false) {
json data = json::array();
int32_t n_tokens = 0;
int i = 0;
for (const auto & elem : embeddings) {
data.push_back(json{
json embedding_obj;
if (use_base64) {
const auto& vec = json_value(elem, "embedding", json::array()).get<std::vector<float>>();
const char* data_ptr = reinterpret_cast<const char*>(vec.data());
size_t data_size = vec.size() * sizeof(float);
embedding_obj = {
{"embedding", base64::encode(data_ptr, data_size)},
{"index", i++},
{"object", "embedding"},
{"encoding_format", "base64"}
};
} else {
embedding_obj = {
{"embedding", json_value(elem, "embedding", json::array())},
{"index", i++},
{"object", "embedding"}
});
};
}
data.push_back(embedding_obj);
n_tokens += json_value(elem, "tokens_evaluated", 0);
}

View file

@ -234,6 +234,7 @@ function(ggml_add_backend_library backend)
# write the shared library to the output directory
set_target_properties(${backend} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
target_compile_definitions(${backend} PRIVATE GGML_BACKEND_DL)
add_dependencies(ggml ${backend})
else()
add_library(${backend} ${ARGN})
target_link_libraries(ggml PUBLIC ${backend})

View file

@ -66,6 +66,26 @@
#include "ggml-kompute.h"
#endif
// disable C++17 deprecation warning for std::codecvt_utf8
#if defined(__clang__)
# pragma clang diagnostic push
# pragma clang diagnostic ignored "-Wdeprecated-declarations"
#endif
static std::wstring utf8_to_utf16(const std::string & str) {
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
return converter.from_bytes(str);
}
static std::string utf16_to_utf8(const std::wstring & str) {
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
return converter.to_bytes(str);
}
#if defined(__clang__)
# pragma clang diagnostic pop
#endif
#ifdef _WIN32
using dl_handle = std::remove_pointer_t<HMODULE>;
@ -88,11 +108,6 @@ static dl_handle * dl_load_library(const std::wstring & path) {
return handle;
}
static dl_handle * dl_load_library(const std::string & path) {
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
return dl_load_library(converter.from_bytes(path));
}
static void * dl_get_sym(dl_handle * handle, const char * name) {
DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
@ -114,8 +129,8 @@ struct dl_handle_deleter {
}
};
static void * dl_load_library(const std::string & path) {
dl_handle * handle = dlopen(path.c_str(), RTLD_NOW | RTLD_LOCAL);
static void * dl_load_library(const std::wstring & path) {
dl_handle * handle = dlopen(utf16_to_utf8(path).c_str(), RTLD_NOW | RTLD_LOCAL);
return handle;
}
@ -202,11 +217,11 @@ struct ggml_backend_registry {
devices.push_back(device);
}
ggml_backend_reg_t load_backend(const char * path, bool silent) {
ggml_backend_reg_t load_backend(const std::wstring & path, bool silent) {
dl_handle_ptr handle { dl_load_library(path) };
if (!handle) {
if (!silent) {
GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path);
GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(path).c_str());
}
return nullptr;
}
@ -214,7 +229,7 @@ struct ggml_backend_registry {
auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
if (score_fn && score_fn() == 0) {
if (!silent) {
GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, path);
GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, utf16_to_utf8(path).c_str());
}
return nullptr;
}
@ -222,7 +237,7 @@ struct ggml_backend_registry {
auto backend_init_fn = (ggml_backend_init_t) dl_get_sym(handle.get(), "ggml_backend_init");
if (!backend_init_fn) {
if (!silent) {
GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, path);
GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, utf16_to_utf8(path).c_str());
}
return nullptr;
}
@ -231,16 +246,16 @@ struct ggml_backend_registry {
if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) {
if (!silent) {
if (!reg) {
GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, path);
GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, utf16_to_utf8(path).c_str());
} else {
GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n",
__func__, path, reg->api_version, GGML_BACKEND_API_VERSION);
__func__, utf16_to_utf8(path).c_str(), reg->api_version, GGML_BACKEND_API_VERSION);
}
}
return nullptr;
}
GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path);
GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), utf16_to_utf8(path).c_str());
register_backend(reg, std::move(handle));
@ -376,14 +391,14 @@ ggml_backend_t ggml_backend_init_best(void) {
// Dynamic loading
ggml_backend_reg_t ggml_backend_load(const char * path) {
return get_reg().load_backend(path, false);
return get_reg().load_backend(utf8_to_utf16(path), false);
}
void ggml_backend_unload(ggml_backend_reg_t reg) {
get_reg().unload_backend(reg, true);
}
static std::string get_executable_path() {
static std::wstring get_executable_path() {
#if defined(__APPLE__)
// get executable path
std::vector<char> path;
@ -401,13 +416,17 @@ static std::string get_executable_path() {
if (last_slash != std::string::npos) {
base_path = base_path.substr(0, last_slash);
}
return base_path + "/";
#elif defined(__linux__)
return utf8_to_utf16(base_path + "/");
#elif defined(__linux__) || defined(__FreeBSD__)
std::string base_path = ".";
std::vector<char> path(1024);
while (true) {
// get executable path
# if defined(__linux__)
ssize_t len = readlink("/proc/self/exe", path.data(), path.size());
# elif defined(__FreeBSD__)
ssize_t len = readlink("/proc/curproc/file", path.data(), path.size());
# endif
if (len == -1) {
break;
}
@ -423,57 +442,63 @@ static std::string get_executable_path() {
path.resize(path.size() * 2);
}
return base_path + "/";
return utf8_to_utf16(base_path + "/");
#elif defined(_WIN32)
std::vector<char> path(MAX_PATH);
DWORD len = GetModuleFileNameA(NULL, path.data(), path.size());
std::vector<wchar_t> path(MAX_PATH);
DWORD len = GetModuleFileNameW(NULL, path.data(), path.size());
if (len == 0) {
return "";
return {};
}
std::string base_path(path.data(), len);
std::wstring base_path(path.data(), len);
// remove executable name
auto last_slash = base_path.find_last_of('\\');
if (last_slash != std::string::npos) {
base_path = base_path.substr(0, last_slash);
}
return base_path + "\\";
return base_path + L"\\";
#else
return {};
#endif
}
static std::string backend_filename_prefix() {
static std::wstring backend_filename_prefix() {
#ifdef _WIN32
return "ggml-";
return L"ggml-";
#else
return "libggml-";
return L"libggml-";
#endif
}
static std::string backend_filename_suffix() {
static std::wstring backend_filename_suffix() {
#ifdef _WIN32
return ".dll";
return L".dll";
#else
return ".so";
return L".so";
#endif
}
static std::wstring path_separator() {
#ifdef _WIN32
return L"\\";
#else
return L"/";
#endif
}
static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, const char * user_search_path) {
// enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths
// TODO: search system paths
std::string file_prefix = backend_filename_prefix() + name + "-";
std::vector<std::string> search_paths;
std::wstring file_prefix = backend_filename_prefix() + utf8_to_utf16(name) + L"-";
std::vector<std::wstring> search_paths;
if (user_search_path == nullptr) {
search_paths.push_back("./");
search_paths.push_back(L"." + path_separator());
search_paths.push_back(get_executable_path());
} else {
#if defined(_WIN32)
search_paths.push_back(std::string(user_search_path) + "\\");
#else
search_paths.push_back(std::string(user_search_path) + "/");
#endif
search_paths.push_back(utf8_to_utf16(user_search_path) + path_separator());
}
int best_score = 0;
std::string best_path;
std::wstring best_path;
namespace fs = std::filesystem;
for (const auto & search_path : search_paths) {
@ -483,27 +508,27 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied);
for (const auto & entry : dir_it) {
if (entry.is_regular_file()) {
std::string filename = entry.path().filename().string();
std::string ext = entry.path().extension().string();
std::wstring filename = entry.path().filename().wstring();
std::wstring ext = entry.path().extension().wstring();
if (filename.find(file_prefix) == 0 && ext == backend_filename_suffix()) {
dl_handle_ptr handle { dl_load_library(entry.path().c_str()) };
dl_handle_ptr handle { dl_load_library(entry.path().wstring()) };
if (!handle && !silent) {
GGML_LOG_ERROR("%s: failed to load %s\n", __func__, entry.path().string().c_str());
GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str());
}
if (handle) {
auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
if (score_fn) {
int s = score_fn();
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, entry.path().string().c_str(), s);
GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str(), s);
#endif
if (s > best_score) {
best_score = s;
best_path = entry.path().string();
best_path = entry.path().wstring();
}
} else {
if (!silent) {
GGML_LOG_INFO("%s: failed to find ggml_backend_score in %s\n", __func__, entry.path().string().c_str());
GGML_LOG_INFO("%s: failed to find ggml_backend_score in %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str());
}
}
}
@ -515,15 +540,15 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
if (best_score == 0) {
// try to load the base backend
for (const auto & search_path : search_paths) {
std::string path = search_path + backend_filename_prefix() + name + backend_filename_suffix();
std::wstring path = search_path + backend_filename_prefix() + utf8_to_utf16(name) + backend_filename_suffix();
if (fs::exists(path)) {
return get_reg().load_backend(path.c_str(), silent);
return get_reg().load_backend(path, silent);
}
}
return nullptr;
}
return get_reg().load_backend(best_path.c_str(), silent);
return get_reg().load_backend(best_path, silent);
}
void ggml_backend_load_all() {

View file

@ -135,14 +135,20 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
endif()
# show enabled features
if (CMAKE_HOST_SYSTEM_NAME STREQUAL "Windows")
set(FEAT_INPUT_FILE "NUL")
else()
set(FEAT_INPUT_FILE "/dev/null")
endif()
execute_process(
COMMAND ${CMAKE_C_COMPILER} ${ARCH_FLAGS} -dM -E -
INPUT_FILE "/dev/null"
INPUT_FILE ${FEAT_INPUT_FILE}
OUTPUT_VARIABLE ARM_FEATURE
RESULT_VARIABLE ARM_FEATURE_RESULT
)
if (ARM_FEATURE_RESULT)
message(FATAL_ERROR "Failed to get ARM features")
message(WARNING "Failed to get ARM features")
else()
foreach(feature DOTPROD SVE MATMUL_INT8 FMA FP16_VECTOR_ARITHMETIC)
string(FIND "${ARM_FEATURE}" "__ARM_FEATURE_${feature} 1" feature_pos)
@ -317,6 +323,11 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
target_compile_definitions(${GGML_CPU_NAME} PRIVATE ${ARCH_DEFINITIONS})
if (GGML_BACKEND_DL)
if (GGML_NATIVE)
# the feature check relies on ARCH_DEFINITIONS, but it is not set with GGML_NATIVE
message(FATAL_ERROR "GGML_NATIVE is not compatible with GGML_BACKEND_DL, consider using GGML_CPU_ALL_VARIANTS")
endif()
# The feature detection code is compiled as a separate target so that
# it can be built without the architecture flags
# Since multiple variants of the CPU backend may be included in the same

View file

@ -986,7 +986,7 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
#define GGML_F16_STEP 32
#define GGML_F16_EPR 4
static inline __m128 __sse_f16x4_load(ggml_fp16_t *x) {
static inline __m128 __sse_f16x4_load(const ggml_fp16_t * x) {
float tmp[4];
tmp[0] = GGML_FP16_TO_FP32(x[0]);
@ -7419,14 +7419,14 @@ static void ggml_compute_forward_mul_mat(
if (src1_cont) {
for (int64_t i13 = 0; i13 < ne13; i13++)
for (int64_t i12 = 0; i12 < ne12; i12++)
if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
if (!llamafile_sgemm(params,
ne01, ne11, ne00/ggml_blck_size(src0->type),
(const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
nb01/ggml_type_size(src0->type),
(const char *)src1->data + i12*nb12 + i13*nb13,
nb11/ggml_type_size(src1->type),
(char *)dst->data + i12*nb2 + i13*nb3,
nb1/ggml_type_size(dst->type),
ith, nth,
src0->type,
src1->type,
dst->type))
@ -7471,14 +7471,14 @@ UseGgmlGemm1:;
for (int64_t i13 = 0; i13 < ne13; i13++)
for (int64_t i12 = 0; i12 < ne12; i12++)
if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
if (!llamafile_sgemm(params,
ne01, ne11, ne00/ggml_blck_size(src0->type),
(const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
nb01/ggml_type_size(src0->type),
(const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size,
row_size/ggml_type_size(vec_dot_type),
(char *)dst->data + i12*nb2 + i13*nb3,
nb1/ggml_type_size(dst->type),
ith, nth,
src0->type,
vec_dot_type,
dst->type))

View file

@ -53,6 +53,8 @@
#include "ggml-cpu-impl.h"
#include "ggml-quants.h"
#include <atomic>
#ifdef _MSC_VER
#define NOINLINE __declspec(noinline)
#else
@ -134,6 +136,16 @@ inline __m512 madd(__m512 a, __m512 b, __m512 c) {
return _mm512_fmadd_ps(a, b, c);
}
#endif
#if defined(__AVX512BF16__)
template <>
inline __m512 madd(__m512bh a, __m512bh b, __m512 c) {
return _mm512_dpbf16_ps(c, a, b);
}
template <>
inline __m256 madd(__m256bh a, __m256bh b, __m256 c) {
return _mm256_dpbf16_ps(c, a, b);
}
#endif
#endif
#if defined(__ARM_FEATURE_FMA)
@ -226,6 +238,13 @@ template <> inline __m256 load(const float *p) {
}
#endif // __AVX__
#if defined(__AVX2__) || defined(__AVX512F__)
template <> inline __m256 load(const ggml_bf16_t *p) {
return _mm256_castsi256_ps(
_mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)p)), 16));
}
#endif // __AVX2__
#if defined(__F16C__)
template <> inline __m256 load(const ggml_fp16_t *p) {
return _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)p));
@ -239,8 +258,27 @@ template <> inline __m512 load(const float *p) {
template <> inline __m512 load(const ggml_fp16_t *p) {
return _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)p));
}
template <> inline __m512 load(const ggml_bf16_t *p) {
return _mm512_castsi512_ps(
_mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i *)p)), 16));
}
#endif // __AVX512F__
#if defined(__AVX512BF16__)
template <> inline __m512bh load(const ggml_bf16_t *p) {
return (__m512bh)_mm512_loadu_ps((const float *)p);
}
template <> inline __m256bh load(const ggml_bf16_t *p) {
return (__m256bh)_mm256_loadu_ps((const float *)p);
}
template <> inline __m512bh load(const float *p) {
return _mm512_cvtne2ps_pbh(_mm512_loadu_ps(p + 16), _mm512_loadu_ps(p));
}
template <> inline __m256bh load(const float *p) {
return _mm512_cvtneps_pbh(_mm512_loadu_ps(p));
}
#endif
////////////////////////////////////////////////////////////////////////////////////////////////////
// CONSTANTS
@ -252,199 +290,170 @@ static const __m128i iq4nlt = _mm_loadu_si128((const __m128i *) kvalues_iq4nl);
////////////////////////////////////////////////////////////////////////////////////////////////////
// FLOATING POINT MATRIX MULTIPLICATION
template <int M>
static inline int64_t BLOCK_SIZE(size_t m) {
const int64_t NB_BLOC_M = (m + M - 1) / M;
return (m % NB_BLOC_M == 0) ? m / NB_BLOC_M : (m / NB_BLOC_M) + 1;
}
static constexpr inline int64_t BLOC_POS(int64_t ib, int64_t ibN, int64_t bloc_size) {
return ib < ibN ? ib * bloc_size : ibN * bloc_size + (ib - ibN) * (bloc_size - 1);
}
template <int KN, typename D, typename V, typename TA, typename TB, typename TC>
class tinyBLAS {
public:
tinyBLAS(int64_t k,
tinyBLAS(const ggml_compute_params * params, int64_t k,
const TA *A, int64_t lda,
const TB *B, int64_t ldb,
TC *C, int64_t ldc,
int ith, int nth)
: A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
TC *C, int64_t ldc)
: params(params), A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc) {
}
void matmul(int64_t m, int64_t n) {
mnpack(0, m, 0, n);
bool matmul(int64_t m, int64_t n) {
if (k % KN != 0)
return false;
// compute RM for only need tile with size RM&RM-1
#if VECTOR_REGISTERS == 32
if (m % 16 == 0 && (m/16 >= params->nth)) {
const int64_t SIZE_N = BLOCK_SIZE<6>(n);
mnpack<4, 6, 4>(m, n, SIZE_N, 12);
return true;
}
if (m % 8 == 0 ) {
const int64_t SIZE_N = BLOCK_SIZE<6>(n);
mnpack<4, 6, 2>(m, n, SIZE_N, 12);
return true;
}
if (m % 4 == 0) {
const int64_t SIZE_N = BLOCK_SIZE<6>(n);
mnpack<4, 6, 1>(m, n, SIZE_N, 12);
return true;
}
#else // VECTOR_REGISTERS == 16
if (m % 16 == 0 && (m/16 >= params->nth)) {
const int64_t SIZE_N = BLOCK_SIZE<3>(n);
mnpack<4, 3, 4>(m, n, SIZE_N, 24);
return true;
}
if (m % 8 == 0 ) {
const int64_t SIZE_N = BLOCK_SIZE<3>(n);
mnpack<4, 3, 2>(m, n, SIZE_N, 24);
return true;
}
if (m % 4 == 0) {
const int64_t SIZE_N = BLOCK_SIZE<3>(n);
mnpack<4, 3, 1>(m, n, SIZE_N, 24);
return true;
}
#endif
return false;
}
private:
NOINLINE void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
int64_t mc, nc, mp, np;
switch ((MIN(m - m0, 5) << 4) | MIN(n - n0, 5)) {
#if VECTOR_REGISTERS == 32
case 0x55:
mc = 5;
nc = 5;
gemm<5, 5>(m0, m, n0, n);
break;
case 0x45:
mc = 4;
nc = 5;
gemm<4, 5>(m0, m, n0, n);
break;
case 0x54:
mc = 5;
nc = 4;
gemm<5, 4>(m0, m, n0, n);
break;
case 0x44:
mc = 4;
nc = 4;
gemm<4, 4>(m0, m, n0, n);
break;
case 0x53:
mc = 5;
nc = 3;
gemm<5, 3>(m0, m, n0, n);
break;
case 0x35:
mc = 3;
nc = 5;
gemm<3, 5>(m0, m, n0, n);
break;
case 0x43:
mc = 4;
nc = 3;
gemm<4, 3>(m0, m, n0, n);
break;
#else
case 0x55:
case 0x54:
case 0x53:
case 0x45:
case 0x44:
case 0x43:
mc = 4;
nc = 3;
gemm<4, 3>(m0, m, n0, n);
break;
case 0x35:
#endif
case 0x34:
mc = 3;
nc = 4;
gemm<3, 4>(m0, m, n0, n);
break;
case 0x52:
mc = 5;
nc = 2;
gemm<5, 2>(m0, m, n0, n);
break;
case 0x33:
mc = 3;
nc = 3;
gemm<3, 3>(m0, m, n0, n);
break;
case 0x25:
mc = 2;
nc = 5;
gemm<2, 5>(m0, m, n0, n);
break;
case 0x42:
mc = 4;
nc = 2;
gemm<4, 2>(m0, m, n0, n);
break;
case 0x24:
mc = 2;
nc = 4;
gemm<2, 4>(m0, m, n0, n);
break;
case 0x32:
mc = 3;
nc = 2;
gemm<3, 2>(m0, m, n0, n);
break;
case 0x23:
mc = 2;
nc = 3;
gemm<2, 3>(m0, m, n0, n);
break;
case 0x51:
mc = 5;
nc = 1;
gemm<5, 1>(m0, m, n0, n);
break;
case 0x41:
mc = 4;
nc = 1;
gemm<4, 1>(m0, m, n0, n);
break;
case 0x22:
mc = 2;
nc = 2;
gemm<2, 2>(m0, m, n0, n);
break;
case 0x15:
mc = 1;
nc = 5;
gemm<1, 5>(m0, m, n0, n);
break;
case 0x14:
mc = 1;
nc = 4;
gemm<1, 4>(m0, m, n0, n);
break;
case 0x31:
mc = 3;
nc = 1;
gemm<3, 1>(m0, m, n0, n);
break;
case 0x13:
mc = 1;
nc = 3;
gemm<1, 3>(m0, m, n0, n);
break;
case 0x21:
mc = 2;
nc = 1;
gemm<2, 1>(m0, m, n0, n);
break;
case 0x12:
mc = 1;
nc = 2;
gemm<1, 2>(m0, m, n0, n);
break;
case 0x11:
mc = 1;
nc = 1;
gemm<1, 1>(m0, m, n0, n);
break;
default:
return;
template <int RM, int RN, int BM>
inline void mnpack(int64_t m, int64_t n, int64_t SIZE_N, int64_t BN) {
if (SIZE_N == RN) {
return gemm<RM, RN, BM>(m, n, BN);
}
if constexpr (RN > 1) {
return mnpack<RM, RN-1, BM>(m, n, SIZE_N, BN);
} else {
GGML_LOG_ERROR("mnpack<%d, %d> bloc size not supported\n", RM, (int)SIZE_N);
GGML_ASSERT(false); // we have miss something.
}
mp = m0 + (m - m0) / mc * mc;
np = n0 + (n - n0) / nc * nc;
mnpack(mp, m, n0, np);
mnpack(m0, m, np, n);
}
template <int RM, int RN>
NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
int64_t ytiles = (m - m0) / RM;
int64_t xtiles = (n - n0) / RN;
int64_t tiles = xtiles * ytiles;
int64_t duty = (tiles + nth - 1) / nth;
int64_t start = duty * ith;
int64_t end = start + duty;
if (end > tiles)
end = tiles;
for (int64_t job = start; job < end; ++job) {
int64_t ii = m0 + job / xtiles * RM;
int64_t jj = n0 + job % xtiles * RN;
inline void gemm_bloc(int64_t ii, int64_t jj) {
D Cv[RN][RM] = {};
for (int64_t l = 0; l < k; l += KN)
for (int64_t j = 0; j < RN; ++j)
for (int64_t i = 0; i < RM; ++i)
Cv[j][i] = madd(load<V>(A + lda * (ii + i) + l),
load<V>(B + ldb * (jj + j) + l),
Cv[j][i]);
for (int64_t l = 0; l < k; l += KN) {
// help compiler for op order.
if constexpr (RM <= RN) {
V Av[RM];
for (int64_t i = 0; i < RM; ++i) {
Av[i] = load<V>(A + lda * (ii + i) + l);
}
for (int64_t j = 0; j < RN; ++j) {
V Bv = load<V>(B + ldb * (jj + j) + l);
for (int64_t i = 0; i < RM; ++i) {
Cv[j][i] = madd(Av[i], Bv, Cv[j][i]);
}
}
} else {
V Bv[RN];
for (int64_t j = 0; j < RN; ++j) {
Bv[j] = load<V>(B + ldb * (jj + j) + l);
}
for (int64_t i = 0; i < RM; ++i) {
V Av = load<V>(A + lda * (ii + i) + l);
for (int64_t j = 0; j < RN; ++j) {
Cv[j][i] = madd(Av, Bv[j], Cv[j][i]);
}
}
}
}
for (int64_t j = 0; j < RN; ++j)
for (int64_t i = 0; i < RM; ++i)
C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
}
template <int RM, int RN, int BM>
NOINLINE void gemm(int64_t m, int64_t n, int64_t BN) {
static std::atomic<int64_t> current_chunk;
GGML_ASSERT(m % (RM * BM) == 0);
const int64_t ytiles = m / (RM * BM);
const int64_t xtiles = (n + RN -1) / RN;
const int64_t jj_RN = (xtiles - (xtiles * RN - n));
// "round" bloc_size to "nearest" BN
const int64_t NB_BN = xtiles < BN ? 1 : (xtiles + BN / 2) / BN;
const int64_t SIZE_BN = xtiles % NB_BN == 0 ? xtiles / NB_BN : xtiles / NB_BN + 1;
const int64_t jj_BN = (NB_BN - (NB_BN * SIZE_BN - xtiles));
const int64_t nb_job = ytiles * NB_BN;
if (params->ith == 0) {
GGML_ASSERT( jj_BN * SIZE_BN + (NB_BN - jj_BN) * (SIZE_BN - 1) == xtiles);
// Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
std::atomic_store_explicit(&current_chunk, (int64_t)params->nth, std::memory_order_relaxed);
}
ggml_barrier(params->threadpool);
int64_t job = params->ith;
while (job < nb_job) {
const int64_t ii = (job % ytiles) * RM * BM;
const int64_t jb = job / ytiles;
const int64_t jr0 = BLOC_POS(jb , jj_BN, SIZE_BN);
const int64_t jrN = BLOC_POS(jb+1, jj_BN, SIZE_BN);
const int64_t jj0 = BLOC_POS(jr0, jj_RN, RN);
const int64_t jj2 = BLOC_POS(jrN, jj_RN, RN);
const int64_t jj1 = jj2 < jj_RN * RN ? jj2 : jj_RN * RN;
for (int64_t bi = 0; bi < BM * RM; bi += RM) {
int64_t jj = jj0;
for (; jj < jj1; jj += RN) {
gemm_bloc<RM, RN>(ii + bi, jj);
}
if constexpr (RN > 1) {
for (; jj < jj2; jj += RN - 1) {
gemm_bloc<RM, RN-1>(ii + bi, jj);
}
}
GGML_ASSERT(jj == jj2);
}
// next step.
job = std::atomic_fetch_add_explicit(&current_chunk, (int64_t)1, std::memory_order_relaxed);
}
ggml_barrier(params->threadpool);
return;
}
const ggml_compute_params * params;
const TA *const A;
const TB *const B;
TC *const C;
@ -452,8 +461,6 @@ class tinyBLAS {
const int64_t lda;
const int64_t ldb;
const int64_t ldc;
const int ith;
const int nth;
};
//////////////////////////////////////////////////////////////////////////////////////////
@ -1657,8 +1664,9 @@ class tinyBLAS_PPC {
* @param Ctype is GGML data type of `C`
* @return true if this function was able to service the matmul request
*/
bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda, const void *B, int64_t ldb, void *C,
int64_t ldc, int ith, int nth, int Atype, int Btype, int Ctype) {
bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64_t n, int64_t k,
const void *A, int64_t lda, const void *B, int64_t ldb, void *C,
int64_t ldc, int Atype, int Btype, int Ctype) {
assert(m >= 0);
assert(n >= 0);
@ -1666,8 +1674,8 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
assert(lda >= k);
assert(ldb >= k);
assert(ldc >= m);
assert(nth > 0);
assert(ith < nth);
assert(params->nth > 0);
assert(params->ith < params->nth);
// only enable sgemm for prompt processing
if (n < 2)
@ -1682,37 +1690,25 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
if (Btype != GGML_TYPE_F32)
return false;
#if defined(__AVX512F__)
if (k % 16)
return false;
tinyBLAS<16, __m512, __m512, float, float, float> tb{
tinyBLAS<16, __m512, __m512, float, float, float> tb{ params,
k, (const float *)A, lda,
(const float *)B, ldb,
(float *)C, ldc,
ith, nth};
tb.matmul(m, n);
return true;
(float *)C, ldc};
return tb.matmul(m, n);
#elif defined(__AVX__) || defined(__AVX2__)
if (k % 8)
return false;
tinyBLAS<8, __m256, __m256, float, float, float> tb{
tinyBLAS<8, __m256, __m256, float, float, float> tb{ params,
k, (const float *)A, lda,
(const float *)B, ldb,
(float *)C, ldc,
ith, nth};
tb.matmul(m, n);
return true;
(float *)C, ldc};
return tb.matmul(m, n);
#elif defined(__ARM_NEON)
if (n < 4)
return false;
if (k % 4)
return false;
tinyBLAS<4, float32x4_t, float32x4_t, float, float, float> tb{
tinyBLAS<4, float32x4_t, float32x4_t, float, float, float> tb{ params,
k, (const float *)A, lda,
(const float *)B, ldb,
(float *)C, ldc,
ith, nth};
tb.matmul(m, n);
return true;
(float *)C, ldc};
return tb.matmul(m, n);
#elif defined(__MMA__)
if (k % 8)
return false;
@ -1720,7 +1716,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
k, (const float *)A, lda,
(const float *)B, ldb,
(float *)C, ldc,
ith, nth};
params->ith, params->nth};
tb.matmul(m, n);
return true;
#else
@ -1728,60 +1724,71 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
#endif
}
case GGML_TYPE_BF16: {
#if defined(__AVX512BF16__)
if (Btype == GGML_TYPE_BF16) {
tinyBLAS<32, __m512, __m512bh, ggml_bf16_t, ggml_bf16_t, float> tb{ params, k,
(const ggml_bf16_t *)A, lda,
(const ggml_bf16_t *)B, ldb,
(float *)C, ldc};
return tb.matmul(m, n);
}
#elif defined(__AVX512F__)
if (Btype == GGML_TYPE_BF16) {
tinyBLAS<16, __m512, __m512, ggml_bf16_t, ggml_bf16_t, float> tb{ params, k,
(const ggml_bf16_t *)A, lda,
(const ggml_bf16_t *)B, ldb,
(float *)C, ldc};
return tb.matmul(m, n);
}
#elif defined(__AVX2__)
if (Btype == GGML_TYPE_BF16) {
tinyBLAS<8, __m256, __m256, ggml_bf16_t, ggml_bf16_t, float> tb{ params, k,
(const ggml_bf16_t *)A, lda,
(const ggml_bf16_t *)B, ldb,
(float *)C, ldc};
return tb.matmul(m, n);
}
#endif
return false;
}
case GGML_TYPE_F16: {
#if defined(__AVX512F__)
if (k % 16)
return false;
if (Btype != GGML_TYPE_F32)
return false;
tinyBLAS<16, __m512, __m512, ggml_fp16_t, float, float> tb{
k, (const ggml_fp16_t *)A, lda,
(const float *)B, ldb,
(float *)C, ldc,
ith, nth};
tb.matmul(m, n);
return true;
if (Btype == GGML_TYPE_F16) {
tinyBLAS<16, __m512, __m512, ggml_fp16_t, ggml_fp16_t, float> tb{ params, k,
(const ggml_fp16_t *)A, lda,
(const ggml_fp16_t *)B, ldb,
(float *)C, ldc};
return tb.matmul(m, n);
}
#elif (defined(__AVX__) || defined(__AVX2__)) && defined(__F16C__)
if (k % 8)
return false;
if (Btype != GGML_TYPE_F32)
return false;
tinyBLAS<8, __m256, __m256, ggml_fp16_t, float, float> tb{
k, (const ggml_fp16_t *)A, lda,
(const float *)B, ldb,
(float *)C, ldc,
ith, nth};
tb.matmul(m, n);
return true;
if (Btype == GGML_TYPE_F16) {
tinyBLAS<8, __m256, __m256, ggml_fp16_t, ggml_fp16_t, float> tb{ params, k,
(const ggml_fp16_t *)A, lda,
(const ggml_fp16_t *)B, ldb,
(float *)C, ldc};
return tb.matmul(m, n);
}
#elif defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER)
if (n < 8)
return false;
if (k % 8)
return false;
if (Btype != GGML_TYPE_F16)
return false;
tinyBLAS<8, float16x8_t, float16x8_t, ggml_fp16_t, ggml_fp16_t, float> tb{
if (Btype == GGML_TYPE_F16) {
tinyBLAS<8, float16x8_t, float16x8_t, ggml_fp16_t, ggml_fp16_t, float> tb{ params,
k, (const ggml_fp16_t *)A, lda,
(const ggml_fp16_t *)B, ldb,
(float *)C, ldc,
ith, nth};
tb.matmul(m, n);
return true;
(float *)C, ldc};
return tb.matmul(m, n);
}
#elif defined(__ARM_NEON) && !defined(_MSC_VER)
if (k % 4)
return false;
if (Btype != GGML_TYPE_F32)
return false;
tinyBLAS<4, float32x4_t, float32x4_t, ggml_fp16_t, float, float> tb{
if (Btype == GGML_TYPE_F32) {
tinyBLAS<4, float32x4_t, float32x4_t, ggml_fp16_t, float, float> tb{ params,
k, (const ggml_fp16_t *)A, lda,
(const float *)B, ldb,
(float *)C, ldc,
ith, nth};
tb.matmul(m, n);
return true;
#else
return false;
(float *)C, ldc};
return tb.matmul(m, n);
}
#endif
return false;
}
case GGML_TYPE_Q8_0: {
@ -1792,7 +1799,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
k, (const block_q8_0 *)A, lda,
(const block_q8_0 *)B, ldb,
(float *)C, ldc,
ith, nth};
params->ith, params->nth};
tb.matmul(m, n);
return true;
#elif defined(__ARM_FEATURE_DOTPROD)
@ -1800,7 +1807,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
k, (const block_q8_0 *)A, lda,
(const block_q8_0 *)B, ldb,
(float *)C, ldc,
ith, nth};
params->ith, params->nth};
tb.matmul(m, n);
return true;
#else
@ -1816,7 +1823,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
k, (const block_q4_0 *)A, lda,
(const block_q8_0 *)B, ldb,
(float *)C, ldc,
ith, nth};
params->ith, params->nth};
tb.matmul(m, n);
return true;
#elif defined(__ARM_FEATURE_DOTPROD)
@ -1824,7 +1831,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
k, (const block_q4_0 *)A, lda,
(const block_q8_0 *)B, ldb,
(float *)C, ldc,
ith, nth};
params->ith, params->nth};
tb.matmul(m, n);
return true;
#else
@ -1840,7 +1847,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
k, (const block_q5_0 *)A, lda,
(const block_q8_0 *)B, ldb,
(float *)C, ldc,
ith, nth};
params->ith, params->nth};
tb.matmul(m, n);
return true;
#else
@ -1856,7 +1863,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
k, (const block_iq4_nl *)A, lda,
(const block_q8_0 *)B, ldb,
(float *)C, ldc,
ith, nth};
params->ith, params->nth};
tb.matmul(m, n);
return true;
#else
@ -1868,6 +1875,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
return false;
}
(void)params;
(void)m;
(void)n;
(void)k;
@ -1877,8 +1885,6 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
(void)ldb;
(void)C;
(void)ldc;
(void)ith;
(void)nth;
(void)Atype;
(void)Btype;
(void)Ctype;

View file

@ -5,8 +5,8 @@
extern "C" {
#endif
bool llamafile_sgemm(int64_t, int64_t, int64_t, const void *, int64_t,
const void *, int64_t, void *, int64_t, int, int,
bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t, int64_t, int64_t,
const void *, int64_t, const void *, int64_t, void *, int64_t,
int, int, int);
#ifdef __cplusplus

View file

@ -1855,53 +1855,58 @@ static void ggml_vk_load_shaders(vk_device& device) {
// mul mat vec
// AMD GCN and Intel graphics cards perform best when the number of rows per shader is doubled
uint32_t rm = 1;
if ((device->vendor_id == VK_VENDOR_ID_AMD && device->subgroup_min_size == 64 && device->subgroup_max_size == 64) || device->vendor_id == VK_VENDOR_ID_INTEL)
rm = 2;
// the number of rows computed per shader depends on GPU model and quant
uint32_t rm_stdq = 1;
uint32_t rm_kq = 2;
if (device->vendor_id == VK_VENDOR_ID_AMD) {
if (device->subgroup_min_size == 64 && device->subgroup_max_size == 64) { // GCN
rm_stdq = 2;
rm_kq = 4;
}
} else if (device->vendor_id == VK_VENDOR_ID_INTEL)
rm_stdq = 2;
// computing additional rows per workgroup is a benefit for Q4_0 -> Q5_1, but not for Q8_0.
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f32_f32", mul_mat_vec_f32_f32_f32_len, mul_mat_vec_f32_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f32_f32", mul_mat_vec_f16_f32_f32_len, mul_mat_vec_f16_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f32_f32", mul_mat_vec_q4_0_f32_f32_len, mul_mat_vec_q4_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f32_f32", mul_mat_vec_q4_1_f32_f32_len, mul_mat_vec_q4_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f32_f32", mul_mat_vec_q5_0_f32_f32_len, mul_mat_vec_q5_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f32_f32", mul_mat_vec_q5_1_f32_f32_len, mul_mat_vec_q5_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f32_f32", mul_mat_vec_q8_0_f32_f32_len, mul_mat_vec_q8_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1*rm, 1, 1}, {device->subgroup_size, 1*rm}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f32_f32", mul_mat_vec_q2_k_f32_f32_len, mul_mat_vec_q2_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f32_f32", mul_mat_vec_q3_k_f32_f32_len, mul_mat_vec_q3_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f32_f32", mul_mat_vec_iq4_nl_f32_f32_len, mul_mat_vec_iq4_nl_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {subgroup_size_16, 2*rm}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f32_f32", mul_mat_vec_q4_0_f32_f32_len, mul_mat_vec_q4_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f32_f32", mul_mat_vec_q4_1_f32_f32_len, mul_mat_vec_q4_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f32_f32", mul_mat_vec_q5_0_f32_f32_len, mul_mat_vec_q5_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f32_f32", mul_mat_vec_q5_1_f32_f32_len, mul_mat_vec_q5_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f32_f32", mul_mat_vec_q8_0_f32_f32_len, mul_mat_vec_q8_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1*rm_stdq, 1, 1}, {device->subgroup_size, 1*rm_stdq}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f32_f32", mul_mat_vec_q2_k_f32_f32_len, mul_mat_vec_q2_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f32_f32", mul_mat_vec_q3_k_f32_f32_len, mul_mat_vec_q3_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f32_f32", mul_mat_vec_iq4_nl_f32_f32_len, mul_mat_vec_iq4_nl_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {subgroup_size_16, 2*rm_stdq}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f16_f32", mul_mat_vec_f32_f16_f32_len, mul_mat_vec_f32_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f16_f32", mul_mat_vec_f16_f16_f32_len, mul_mat_vec_f16_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f16_f32", mul_mat_vec_q4_0_f16_f32_len, mul_mat_vec_q4_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f16_f32", mul_mat_vec_q4_1_f16_f32_len, mul_mat_vec_q4_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f16_f32", mul_mat_vec_q5_0_f16_f32_len, mul_mat_vec_q5_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f16_f32", mul_mat_vec_q5_1_f16_f32_len, mul_mat_vec_q5_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f16_f32", mul_mat_vec_q8_0_f16_f32_len, mul_mat_vec_q8_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1*rm, 1, 1}, {device->subgroup_size, 1*rm}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f16_f32", mul_mat_vec_q2_k_f16_f32_len, mul_mat_vec_q2_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f16_f32", mul_mat_vec_q3_k_f16_f32_len, mul_mat_vec_q3_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f16_f32", mul_mat_vec_iq4_nl_f16_f32_len, mul_mat_vec_iq4_nl_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {subgroup_size_16, 2*rm}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f16_f32", mul_mat_vec_q4_0_f16_f32_len, mul_mat_vec_q4_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f16_f32", mul_mat_vec_q4_1_f16_f32_len, mul_mat_vec_q4_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f16_f32", mul_mat_vec_q5_0_f16_f32_len, mul_mat_vec_q5_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f16_f32", mul_mat_vec_q5_1_f16_f32_len, mul_mat_vec_q5_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f16_f32", mul_mat_vec_q8_0_f16_f32_len, mul_mat_vec_q8_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1*rm_stdq, 1, 1}, {device->subgroup_size, 1*rm_stdq}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f16_f32", mul_mat_vec_q2_k_f16_f32_len, mul_mat_vec_q2_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f16_f32", mul_mat_vec_q3_k_f16_f32_len, mul_mat_vec_q3_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f16_f32", mul_mat_vec_iq4_nl_f16_f32_len, mul_mat_vec_iq4_nl_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {subgroup_size_16, 2*rm_stdq}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32", mul_mat_vec_id_f32_f32_len, mul_mat_vec_id_f32_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F16 ], "mul_mat_vec_id_f16_f32", mul_mat_vec_id_f16_f32_len, mul_mat_vec_id_f16_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_0], "mul_mat_vec_id_q4_0_f32", mul_mat_vec_id_q4_0_f32_len, mul_mat_vec_id_q4_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_1], "mul_mat_vec_id_q4_1_f32", mul_mat_vec_id_q4_1_f32_len, mul_mat_vec_id_q4_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_f32", mul_mat_vec_id_q5_0_f32_len, mul_mat_vec_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_f32", mul_mat_vec_id_q5_1_f32_len, mul_mat_vec_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_f32", mul_mat_vec_id_q8_0_f32_len, mul_mat_vec_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1*rm, 1, 1}, {device->subgroup_size, 1*rm}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_k_f32", mul_mat_vec_id_q2_k_f32_len, mul_mat_vec_id_q2_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_f32", mul_mat_vec_id_q3_k_f32_len, mul_mat_vec_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", mul_mat_vec_id_iq4_nl_f32_len, mul_mat_vec_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm, 1, 1}, {subgroup_size_16, 2*rm}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_0], "mul_mat_vec_id_q4_0_f32", mul_mat_vec_id_q4_0_f32_len, mul_mat_vec_id_q4_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_1], "mul_mat_vec_id_q4_1_f32", mul_mat_vec_id_q4_1_f32_len, mul_mat_vec_id_q4_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_f32", mul_mat_vec_id_q5_0_f32_len, mul_mat_vec_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_f32", mul_mat_vec_id_q5_1_f32_len, mul_mat_vec_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_f32", mul_mat_vec_id_q8_0_f32_len, mul_mat_vec_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1*rm_stdq, 1, 1}, {device->subgroup_size, 1*rm_stdq}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_k_f32", mul_mat_vec_id_q2_k_f32_len, mul_mat_vec_id_q2_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_f32", mul_mat_vec_id_q3_k_f32_len, mul_mat_vec_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", mul_mat_vec_id_iq4_nl_f32_len, mul_mat_vec_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {subgroup_size_16, 2*rm_stdq}, 1, true);
// dequant shaders
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16", dequant_f32_len, dequant_f32_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);

View file

@ -6,21 +6,15 @@
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
layout (constant_id = 0) const uint BLOCK_SIZE = 32;
layout (constant_id = 1) const uint NUM_ROWS = 1;
shared FLOAT_TYPE tmp[BLOCK_SIZE];
void main() {
const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
if (row >= p.stride_d) {
return;
}
shared FLOAT_TYPE tmpsh[NUM_ROWS][BLOCK_SIZE];
void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
uint a_offset, b_offset, d_offset;
get_offsets(a_offset, b_offset, d_offset);
const uint num_blocks_per_row = p.ncols / QUANT_K;
const uint ib0 = a_offset / QUANT_K + row*num_blocks_per_row;
// 16 threads are used to process each block
const uint it_size = gl_WorkGroupSize.x/16;
@ -38,15 +32,15 @@ void main() {
const uint s_offset = 8*v_im;
const uint y_offset = 128*v_im + l0;
FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp
FLOAT_TYPE temp[NUM_ROWS];
[[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
temp[i] = FLOAT_TYPE(0);
}
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) {
const uint y_idx = i * QUANT_K + y_offset;
f16vec2 d = data_a[ib0 + i].d;
const FLOAT_TYPE dall = d.x;
const FLOAT_TYPE dmin = d.y;
B_TYPE_VEC2 b0 = data_b_v2[(b_offset + y_idx) / 2 + 0];
B_TYPE_VEC2 b16 = data_b_v2[(b_offset + y_idx) / 2 + 8];
B_TYPE_VEC2 b32 = data_b_v2[(b_offset + y_idx) / 2 + 16];
@ -56,6 +50,12 @@ void main() {
B_TYPE_VEC2 b96 = data_b_v2[(b_offset + y_idx) / 2 + 48];
B_TYPE_VEC2 b112 = data_b_v2[(b_offset + y_idx) / 2 + 56];
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
f16vec2 d = data_a[ib0 + i].d;
const FLOAT_TYPE dall = d.x;
const FLOAT_TYPE dmin = d.y;
uint32_t s0_u32 = data_a_packed32[ib0 + i].scales[s_offset / 4 + 0];
uint32_t s4_u32 = data_a_packed32[ib0 + i].scales[s_offset / 4 + 1];
@ -94,20 +94,40 @@ void main() {
fma(FLOAT_TYPE(b96[l]), FLOAT_TYPE(s4_hi4[2]),
fma(FLOAT_TYPE(b112[l]), FLOAT_TYPE(s4_hi4[3]), sum2))))))));
}
temp = fma(dall, sum1, fma(-dmin, sum2, temp));
temp[n] = fma(dall, sum1, fma(-dmin, sum2, temp[n]));
}
}
tmp[gl_LocalInvocationID.x] = temp;
// sum up partial sums and write back result
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
tmpsh[n][tid] = temp[n];
}
barrier();
[[unroll]] for (uint s = gl_WorkGroupSize.x/2; s > 0; s >>= 1) {
[[unroll]] for (uint s = BLOCK_SIZE/2; s > 0; s >>= 1) {
if (tid < s) {
tmp[tid] += tmp[tid + s];
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
tmpsh[n][tid] += tmpsh[n][tid + s];
}
}
barrier();
}
if (tid == 0) {
data_d[d_offset + row] = D_TYPE(tmp[0]);
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
data_d[d_offset + first_row + n] = D_TYPE(tmpsh[n][0]);
}
}
}
void main() {
const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
// do NUM_ROWS at a time, unless there aren't enough remaining rows
if (first_row + NUM_ROWS <= p.stride_d) {
compute_outputs(first_row, NUM_ROWS);
} else {
if (first_row >= p.stride_d) {
return;
}
compute_outputs(first_row, p.stride_d - first_row);
}
}

View file

@ -6,21 +6,15 @@
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
layout (constant_id = 0) const uint BLOCK_SIZE = 32;
layout (constant_id = 1) const uint NUM_ROWS = 1;
shared FLOAT_TYPE tmp[BLOCK_SIZE];
void main() {
const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
if (row >= p.stride_d) {
return;
}
shared FLOAT_TYPE tmpsh[NUM_ROWS][BLOCK_SIZE];
void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
uint a_offset, b_offset, d_offset;
get_offsets(a_offset, b_offset, d_offset);
const uint num_blocks_per_row = p.ncols / QUANT_K;
const uint ib0 = a_offset / QUANT_K + row*num_blocks_per_row;
// 16 threads are used to process each block
const uint it_size = gl_WorkGroupSize.x/16;
@ -39,15 +33,17 @@ void main() {
const uint q_offset = 32*v_im + l0;
const uint y_offset = 128*v_im + l0;
FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp
FLOAT_TYPE temp[NUM_ROWS];
[[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
temp[i] = FLOAT_TYPE(0);
}
const uint s_shift = 4 * v_im;
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) {
const uint y_idx = i * QUANT_K + y_offset;
const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d);
B_TYPE_VEC2 b0 = data_b_v2[(b_offset + y_idx) / 2 + 0];
B_TYPE_VEC2 b16 = data_b_v2[(b_offset + y_idx) / 2 + 8];
B_TYPE_VEC2 b32 = data_b_v2[(b_offset + y_idx) / 2 + 16];
@ -57,6 +53,10 @@ void main() {
B_TYPE_VEC2 b96 = data_b_v2[(b_offset + y_idx) / 2 + 48];
B_TYPE_VEC2 b112 = data_b_v2[(b_offset + y_idx) / 2 + 56];
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d);
uint16_t s0_16 = data_a_packed16[ib0 + i].scales[0];
uint16_t s2_16 = data_a_packed16[ib0 + i].scales[1];
uint16_t s4_16 = data_a_packed16[ib0 + i].scales[2];
@ -81,20 +81,40 @@ void main() {
fma(FLOAT_TYPE(b80[l]) * FLOAT_TYPE(int8_t(((s4[1] >> s_shift) & 0xF) | ((s8[1] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 2)) != 0) ? 0 : 4)),
fma(FLOAT_TYPE(b112[l]) * FLOAT_TYPE(int8_t(((s6[1] >> s_shift) & 0xF) | ((s10[1] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 6) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 3)) != 0) ? 0 : 4)), sum))))))));
}
temp = fma(d, sum, temp);
temp[n] = fma(d, sum, temp[n]);
}
}
tmp[gl_LocalInvocationID.x] = temp;
// sum up partial sums and write back result
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
tmpsh[n][tid] = temp[n];
}
barrier();
[[unroll]] for (uint s = gl_WorkGroupSize.x/2; s > 0; s >>= 1) {
[[unroll]] for (uint s = BLOCK_SIZE/2; s > 0; s >>= 1) {
if (tid < s) {
tmp[tid] += tmp[tid + s];
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
tmpsh[n][tid] += tmpsh[n][tid + s];
}
}
barrier();
}
if (tid == 0) {
data_d[d_offset + row] = D_TYPE(tmp[0]);
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
data_d[d_offset + first_row + n] = D_TYPE(tmpsh[n][0]);
}
}
}
void main() {
const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
// do NUM_ROWS at a time, unless there aren't enough remaining rows
if (first_row + NUM_ROWS <= p.stride_d) {
compute_outputs(first_row, NUM_ROWS);
} else {
if (first_row >= p.stride_d) {
return;
}
compute_outputs(first_row, p.stride_d - first_row);
}
}

View file

@ -7,21 +7,15 @@
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
layout (constant_id = 0) const uint BLOCK_SIZE = 32;
layout (constant_id = 1) const uint NUM_ROWS = 1;
shared FLOAT_TYPE tmp[BLOCK_SIZE];
void main() {
const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
if (row >= p.stride_d) {
return;
}
shared FLOAT_TYPE tmpsh[NUM_ROWS][BLOCK_SIZE];
void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
uint a_offset, b_offset, d_offset;
get_offsets(a_offset, b_offset, d_offset);
const uint num_blocks_per_row = p.ncols / QUANT_K;
const uint ib0 = a_offset / QUANT_K + row*num_blocks_per_row;
// 16 threads are used to process each block
const uint it_size = gl_WorkGroupSize.x/16;
@ -42,12 +36,23 @@ void main() {
const uint q_offset = 32*v_im + l0;
const uint y_offset = 64*v_im + l0;
FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp
FLOAT_TYPE temp[NUM_ROWS];
[[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
temp[i] = FLOAT_TYPE(0);
}
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) {
const uint y1_idx = i * QUANT_K + y_offset;
const uint y2_idx = y1_idx + 128;
B_TYPE_VEC4 by10 = data_b_v4[(b_offset + y1_idx) / 4];
B_TYPE_VEC4 by132 = data_b_v4[(b_offset + y1_idx) / 4 + 8];
B_TYPE_VEC4 by20 = data_b_v4[(b_offset + y2_idx) / 4];
B_TYPE_VEC4 by232 = data_b_v4[(b_offset + y2_idx) / 4 + 8];
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
f16vec2 d = data_a[ib0 + i].d;
const FLOAT_TYPE dall = FLOAT_TYPE(d.x);
const FLOAT_TYPE dmin = FLOAT_TYPE(d.y);
@ -98,11 +103,6 @@ void main() {
const uint32_t q4_14 = qs64_hi4.z;
const uint32_t q4_15 = qs64_hi4.w;
B_TYPE_VEC4 by10 = data_b_v4[(b_offset + y1_idx) / 4];
B_TYPE_VEC4 by132 = data_b_v4[(b_offset + y1_idx) / 4 + 8];
B_TYPE_VEC4 by20 = data_b_v4[(b_offset + y2_idx) / 4];
B_TYPE_VEC4 by232 = data_b_v4[(b_offset + y2_idx) / 4 + 8];
const FLOAT_TYPE sx = fma(FLOAT_TYPE(by10.x), q4_0, fma(FLOAT_TYPE(by10.y), q4_1, fma(FLOAT_TYPE(by10.z), q4_2, FLOAT_TYPE(by10.w) * q4_3)));
const FLOAT_TYPE sy = fma(FLOAT_TYPE(by132.x), q4_4, fma(FLOAT_TYPE(by132.y), q4_5, fma(FLOAT_TYPE(by132.z), q4_6, FLOAT_TYPE(by132.w) * q4_7)));
const FLOAT_TYPE sz = fma(FLOAT_TYPE(by20.x), q4_8, fma(FLOAT_TYPE(by20.y), q4_9, fma(FLOAT_TYPE(by20.z), q4_10, FLOAT_TYPE(by20.w) * q4_11)));
@ -112,20 +112,40 @@ void main() {
fma(FLOAT_TYPE(by10.y), sc2, fma(FLOAT_TYPE(by132.y), sc3, fma(FLOAT_TYPE(by20.y), sc6, fma(FLOAT_TYPE(by232.y), sc7,
fma(FLOAT_TYPE(by10.z), sc2, fma(FLOAT_TYPE(by132.z), sc3, fma(FLOAT_TYPE(by20.z), sc6, fma(FLOAT_TYPE(by232.z), sc7,
fma(FLOAT_TYPE(by10.w), sc2, fma(FLOAT_TYPE(by132.w), sc3, fma(FLOAT_TYPE(by20.w), sc6, FLOAT_TYPE(by232.w) * sc7)))))))))))))));
temp = fma(dall, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dmin, smin, temp));
temp[n] = fma(dall, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dmin, smin, temp[n]));
}
}
tmp[gl_LocalInvocationID.x] = temp;
// sum up partial sums and write back result
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
tmpsh[n][tid] = temp[n];
}
barrier();
[[unroll]] for (uint s = gl_WorkGroupSize.x/2; s > 0; s >>= 1) {
[[unroll]] for (uint s = BLOCK_SIZE/2; s > 0; s >>= 1) {
if (tid < s) {
tmp[tid] += tmp[tid + s];
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
tmpsh[n][tid] += tmpsh[n][tid + s];
}
}
barrier();
}
if (tid == 0) {
data_d[d_offset + row] = D_TYPE(tmp[0]);
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
data_d[d_offset + first_row + n] = D_TYPE(tmpsh[n][0]);
}
}
}
void main() {
const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
// do NUM_ROWS at a time, unless there aren't enough remaining rows
if (first_row + NUM_ROWS <= p.stride_d) {
compute_outputs(first_row, NUM_ROWS);
} else {
if (first_row >= p.stride_d) {
return;
}
compute_outputs(first_row, p.stride_d - first_row);
}
}

View file

@ -7,21 +7,15 @@
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
layout (constant_id = 0) const uint BLOCK_SIZE = 32;
layout (constant_id = 1) const uint NUM_ROWS = 1;
shared FLOAT_TYPE tmp[BLOCK_SIZE];
void main() {
const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
if (row >= p.stride_d) {
return;
}
shared FLOAT_TYPE tmpsh[NUM_ROWS][BLOCK_SIZE];
void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
uint a_offset, b_offset, d_offset;
get_offsets(a_offset, b_offset, d_offset);
const uint num_blocks_per_row = p.ncols / QUANT_K;
const uint ib0 = a_offset / QUANT_K + row*num_blocks_per_row;
// 16 threads are used to process each block
const uint it_size = gl_WorkGroupSize.x/16;
@ -39,12 +33,27 @@ void main() {
const uint q_offset = 32*v_im + l0;
const uint y_offset = 64*v_im + l0;
FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp
FLOAT_TYPE temp[NUM_ROWS];
[[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
temp[i] = FLOAT_TYPE(0);
}
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) {
const uint y1_idx = i * QUANT_K + y_offset;
const uint y2_idx = y1_idx + 128;
B_TYPE_VEC2 by10 = data_b_v2[(b_offset + y1_idx) / 2];
B_TYPE_VEC2 by116 = data_b_v2[(b_offset + y1_idx) / 2 + 8];
B_TYPE_VEC2 by132 = data_b_v2[(b_offset + y1_idx) / 2 + 16];
B_TYPE_VEC2 by148 = data_b_v2[(b_offset + y1_idx) / 2 + 24];
B_TYPE_VEC2 by20 = data_b_v2[(b_offset + y2_idx) / 2];
B_TYPE_VEC2 by216 = data_b_v2[(b_offset + y2_idx) / 2 + 8];
B_TYPE_VEC2 by232 = data_b_v2[(b_offset + y2_idx) / 2 + 16];
B_TYPE_VEC2 by248 = data_b_v2[(b_offset + y2_idx) / 2 + 24];
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
f16vec2 d = data_a[ib0 + i].d;
const FLOAT_TYPE dall = FLOAT_TYPE(d.x);
const FLOAT_TYPE dmin = FLOAT_TYPE(d.y);
@ -107,15 +116,6 @@ void main() {
const uint32_t q4_14 = qs64_80_hi4.z;
const uint32_t q4_15 = qs64_80_hi4.w;
B_TYPE_VEC2 by10 = data_b_v2[(b_offset + y1_idx) / 2];
B_TYPE_VEC2 by116 = data_b_v2[(b_offset + y1_idx) / 2 + 8];
B_TYPE_VEC2 by132 = data_b_v2[(b_offset + y1_idx) / 2 + 16];
B_TYPE_VEC2 by148 = data_b_v2[(b_offset + y1_idx) / 2 + 24];
B_TYPE_VEC2 by20 = data_b_v2[(b_offset + y2_idx) / 2];
B_TYPE_VEC2 by216 = data_b_v2[(b_offset + y2_idx) / 2 + 8];
B_TYPE_VEC2 by232 = data_b_v2[(b_offset + y2_idx) / 2 + 16];
B_TYPE_VEC2 by248 = data_b_v2[(b_offset + y2_idx) / 2 + 24];
const FLOAT_TYPE sx =
fma(FLOAT_TYPE(by10.x), q4_0,
fma(FLOAT_TYPE(by10.y), q4_1,
@ -141,20 +141,40 @@ void main() {
fma(FLOAT_TYPE(by132.x) + FLOAT_TYPE(by132.y) + FLOAT_TYPE(by148.x) + FLOAT_TYPE(by148.y), sc3,
fma(FLOAT_TYPE(by20.x) + FLOAT_TYPE(by20.y) + FLOAT_TYPE(by216.x) + FLOAT_TYPE(by216.y), sc6,
(FLOAT_TYPE(by232.x) + FLOAT_TYPE(by232.y) + FLOAT_TYPE(by248.x) + FLOAT_TYPE(by248.y)) * sc7)));
temp = fma(dall, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dmin, smin, temp));
temp[n] = fma(dall, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dmin, smin, temp[n]));
}
}
tmp[gl_LocalInvocationID.x] = temp;
// sum up partial sums and write back result
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
tmpsh[n][tid] = temp[n];
}
barrier();
[[unroll]] for (uint s = gl_WorkGroupSize.x/2; s > 0; s >>= 1) {
[[unroll]] for (uint s = BLOCK_SIZE/2; s > 0; s >>= 1) {
if (tid < s) {
tmp[tid] += tmp[tid + s];
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
tmpsh[n][tid] += tmpsh[n][tid + s];
}
}
barrier();
}
if (tid == 0) {
data_d[d_offset + row] = D_TYPE(tmp[0]);
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
data_d[d_offset + first_row + n] = D_TYPE(tmpsh[n][0]);
}
}
}
void main() {
const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
// do NUM_ROWS at a time, unless there aren't enough remaining rows
if (first_row + NUM_ROWS <= p.stride_d) {
compute_outputs(first_row, NUM_ROWS);
} else {
if (first_row >= p.stride_d) {
return;
}
compute_outputs(first_row, p.stride_d - first_row);
}
}

View file

@ -7,21 +7,15 @@
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
layout (constant_id = 0) const uint BLOCK_SIZE = 32;
layout (constant_id = 1) const uint NUM_ROWS = 1;
shared FLOAT_TYPE tmp[BLOCK_SIZE];
void main() {
const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
if (row >= p.stride_d) {
return;
}
shared FLOAT_TYPE tmpsh[NUM_ROWS][BLOCK_SIZE];
void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
uint a_offset, b_offset, d_offset;
get_offsets(a_offset, b_offset, d_offset);
const uint num_blocks_per_row = p.ncols / QUANT_K;
const uint ib0 = a_offset / QUANT_K + row*num_blocks_per_row;
// 16 threads are used to process each block
const uint it_size = gl_WorkGroupSize.x/16;
@ -42,11 +36,22 @@ void main() {
const uint s_offset = 8*v_im + is;
const uint y_offset = 128*v_im + l0;
FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp
FLOAT_TYPE temp[NUM_ROWS];
[[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
temp[i] = FLOAT_TYPE(0);
}
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) {
const uint y_idx = i * QUANT_K + y_offset;
B_TYPE_VEC4 by0 = data_b_v4[(b_offset + y_idx) / 4];
B_TYPE_VEC4 by32 = data_b_v4[(b_offset + y_idx) / 4 + 8];
B_TYPE_VEC4 by64 = data_b_v4[(b_offset + y_idx) / 4 + 16];
B_TYPE_VEC4 by96 = data_b_v4[(b_offset + y_idx) / 4 + 24];
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d);
FLOAT_TYPE scales[4];
@ -79,11 +84,6 @@ void main() {
uvec4 q2 = uvec4(unpack8(q2_u32));
uvec4 q3 = uvec4(unpack8(q3_u32));
B_TYPE_VEC4 by0 = data_b_v4[(b_offset + y_idx) / 4];
B_TYPE_VEC4 by32 = data_b_v4[(b_offset + y_idx) / 4 + 8];
B_TYPE_VEC4 by64 = data_b_v4[(b_offset + y_idx) / 4 + 16];
B_TYPE_VEC4 by96 = data_b_v4[(b_offset + y_idx) / 4 + 24];
FLOAT_TYPE sum = FLOAT_TYPE(0.0);
[[unroll]] for (int l = 0; l < 4; ++l) {
sum = fma(FLOAT_TYPE(by0[l]) * scales[0], FLOAT_TYPE(int8_t(q0[l]) - 32),
@ -91,20 +91,40 @@ void main() {
fma(FLOAT_TYPE(by64[l]) * scales[2], FLOAT_TYPE(int8_t(q2[l]) - 32),
fma(FLOAT_TYPE(by96[l]) * scales[3], FLOAT_TYPE(int8_t(q3[l]) - 32), sum))));
}
temp += sum * d;
temp[n] += sum * d;
}
}
tmp[gl_LocalInvocationID.x] = temp;
// sum up partial sums and write back result
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
tmpsh[n][tid] = temp[n];
}
barrier();
[[unroll]] for (uint s = gl_WorkGroupSize.x/2; s > 0; s >>= 1) {
[[unroll]] for (uint s = BLOCK_SIZE/2; s > 0; s >>= 1) {
if (tid < s) {
tmp[tid] += tmp[tid + s];
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
tmpsh[n][tid] += tmpsh[n][tid + s];
}
}
barrier();
}
if (tid == 0) {
data_d[d_offset + row] = D_TYPE(tmp[0]);
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
data_d[d_offset + first_row + n] = D_TYPE(tmpsh[n][0]);
}
}
}
void main() {
const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
// do NUM_ROWS at a time, unless there aren't enough remaining rows
if (first_row + NUM_ROWS <= p.stride_d) {
compute_outputs(first_row, NUM_ROWS);
} else {
if (first_row >= p.stride_d) {
return;
}
compute_outputs(first_row, p.stride_d - first_row);
}
}

View file

@ -78,7 +78,8 @@ void execute_command(const std::string& command, std::string& stdout_str, std::s
}
PROCESS_INFORMATION pi;
STARTUPINFOA si = { sizeof(STARTUPINFOA) };
STARTUPINFOA si = {};
si.cb = sizeof(STARTUPINFOA);
si.dwFlags = STARTF_USESTDHANDLES;
si.hStdOutput = stdout_write;
si.hStdError = stderr_write;

View file

@ -126,6 +126,8 @@ connection = sqlite3.connect(input_file)
cursor = connection.cursor()
builds = cursor.execute("SELECT DISTINCT build_commit FROM test;").fetchall()
commit_short_len = len(builds[0][0])
try:
repo = git.Repo(".", search_parent_directories=True)
except git.InvalidGitRepositoryError:
@ -138,11 +140,11 @@ def find_parent_in_data(commit: git.Commit):
seen_hexsha8 = set()
while heap:
depth, current_commit = heapq.heappop(heap)
current_hexsha8 = commit.hexsha[:8]
current_hexsha8 = commit.hexsha[:commit_short_len]
if (current_hexsha8,) in builds:
return current_hexsha8
for parent in commit.parents:
parent_hexsha8 = parent.hexsha[:8]
parent_hexsha8 = parent.hexsha[:commit_short_len]
if parent_hexsha8 not in seen_hexsha8:
seen_hexsha8.add(parent_hexsha8)
heapq.heappush(heap, (depth + 1, parent))
@ -156,9 +158,9 @@ def get_all_parent_hexsha8s(commit: git.Commit):
while unvisited:
current_commit = unvisited.pop(0)
visited.append(current_commit.hexsha[:8])
visited.append(current_commit.hexsha[:commit_short_len])
for parent in current_commit.parents:
if parent.hexsha[:8] not in visited:
if parent.hexsha[:commit_short_len] not in visited:
unvisited.append(parent)
return visited
@ -169,10 +171,10 @@ def get_commit_name(hexsha8):
if repo is None:
return hexsha8
for h in repo.heads:
if h.commit.hexsha[:8] == hexsha8:
if h.commit.hexsha[:commit_short_len] == hexsha8:
return h.name
for t in repo.tags:
if t.commit.hexsha[:8] == hexsha8:
if t.commit.hexsha[:commit_short_len] == hexsha8:
return t.name
return hexsha8
@ -183,13 +185,13 @@ def get_commit_hexsha8(name):
return None
for h in repo.heads:
if h.name == name:
return h.commit.hexsha[:8]
return h.commit.hexsha[:commit_short_len]
for t in repo.tags:
if t.name == name:
return t.commit.hexsha[:8]
return t.commit.hexsha[:commit_short_len]
for c in repo.iter_commits("--all"):
if c.hexsha[:8] == name[:8]:
return c.hexsha[:8]
if c.hexsha[:commit_short_len] == name[:commit_short_len]:
return c.hexsha[:commit_short_len]
return None

View file

@ -26,7 +26,7 @@ function has_cmd {
}
if has_cmd wget; then
cmd="wget -q --show-progress -c -O %s/%s %s"
cmd="wget -q -c -O %s/%s %s"
elif has_cmd curl; then
cmd="curl -C - -f --output-dir %s -o %s -L %s"
else

View file

@ -1657,7 +1657,7 @@ bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token t
}
llama_token llama_token_bos_impl(const struct llama_vocab & vocab) {
return vocab.special_bos_id;
return vocab.type != LLAMA_VOCAB_TYPE_WPM ? vocab.special_bos_id : vocab.special_cls_id;
}
llama_token llama_token_eos_impl(const struct llama_vocab & vocab) {

View file

@ -45,7 +45,7 @@ struct llama_vocab {
id special_unk_id = 0;
id special_sep_id = LLAMA_TOKEN_NULL;
id special_pad_id = LLAMA_TOKEN_NULL;
id special_cls_id = LLAMA_TOKEN_NULL;
id special_cls_id = LLAMA_TOKEN_NULL; // TODO: revisit if this is really needed https://github.com/ggerganov/llama.cpp/pull/10930
id special_mask_id = LLAMA_TOKEN_NULL;
id linefeed_id = 13;

View file

@ -1720,6 +1720,7 @@ enum llm_chat_template {
LLM_CHAT_TEMPLATE_RWKV_WORLD,
LLM_CHAT_TEMPLATE_GRANITE,
LLM_CHAT_TEMPLATE_GIGACHAT,
LLM_CHAT_TEMPLATE_MEGREZ,
LLM_CHAT_TEMPLATE_UNKNOWN,
};
@ -1753,6 +1754,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
{ "rwkv-world", LLM_CHAT_TEMPLATE_RWKV_WORLD },
{ "granite", LLM_CHAT_TEMPLATE_GRANITE },
{ "gigachat", LLM_CHAT_TEMPLATE_GIGACHAT },
{ "megrez", LLM_CHAT_TEMPLATE_MEGREZ },
};
static llm_arch llm_arch_from_string(const std::string & name) {
@ -6703,6 +6705,9 @@ static void llm_load_vocab(
} else if (
tokenizer_pre == "minerva-7b") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MINERVA;
} else if (
tokenizer_pre == "megrez") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
} else {
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
}
@ -22931,6 +22936,8 @@ static llm_chat_template llama_chat_detect_template(const std::string & tmpl) {
return LLM_CHAT_TEMPLATE_GRANITE;
} else if (tmpl_contains("message['role'] + additional_special_tokens[0] + message['content'] + additional_special_tokens[1]")) {
return LLM_CHAT_TEMPLATE_GIGACHAT;
} else if (tmpl_contains("<|role_start|>")) {
return LLM_CHAT_TEMPLATE_MEGREZ;
}
return LLM_CHAT_TEMPLATE_UNKNOWN;
}
@ -23289,6 +23296,16 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) {
ss << "assistant<|role_sep|>";
}
} else if (tmpl == LLM_CHAT_TEMPLATE_MEGREZ) {
// Megrez template
for (auto message : chat) {
std::string role(message->role);
ss << "<|role_start|>" << role << "<|role_end|>" << message->content << "<|turn_end|>";
}
if (add_ass) {
ss << "<|role_start|>assistant<|role_end|>";
}
} else {
// template not supported
return -1;

View file

@ -77,6 +77,8 @@ int main(void) {
"{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + '[/INST]' }}{% elif message['role'] == 'system' %}{{ '[SYSTEM_PROMPT] ' + message['content'] + '[/SYSTEM_PROMPT]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + eos_token }}{% else %}{{ raise_exception('Only user, system and assistant roles are supported!') }}{% endif %}{% endfor %}",
// ai-sage/GigaChat-20B-A3B-instruct
"{% if messages[0]['role'] == 'system' -%}\n {%- set loop_messages = messages[1:] -%}\n {%- set system_message = bos_token + messages[0]['content'] + additional_special_tokens[1] -%}\n{%- else -%}\n {%- set loop_messages = messages -%}\n {%- set system_message = bos_token + '' -%}\n{%- endif -%}\n{%- for message in loop_messages %}\n {% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}\n {% endif %}\n \n {%- if loop.index0 == 0 -%}\n {{ system_message -}}\n {%- endif -%}\n {%- if message['role'] == 'user' -%}\n {{ message['role'] + additional_special_tokens[0] + message['content'] + additional_special_tokens[1] -}}\n {{ 'available functions' + additional_special_tokens[0] + additional_special_tokens[2] + additional_special_tokens[3] + additional_special_tokens[1] -}}\n {%- endif -%}\n {%- if message['role'] == 'assistant' -%}\n {{ message['role'] + additional_special_tokens[0] + message['content'] + additional_special_tokens[1] -}}\n {%- endif -%}\n {%- if loop.last and add_generation_prompt -%}\n {{ 'assistant' + additional_special_tokens[0] -}}\n {%- endif -%}\n{%- endfor %}",
// Infinigence/Megrez-3B-Instruct
u8"{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|role_start|>system<|role_end|>你是Megrez-3B-Instruct将针对用户的问题给出详细的、积极的回答。<|turn_end|>' }}{% endif %}{{ '<|role_start|>' + message['role'] + '<|role_end|>' + message['content'] + '<|turn_end|>' }}{% endfor %}{% if add_generation_prompt %}{{ '<|role_start|>assistant<|role_end|>' }}{% endif %}"
};
std::vector<std::string> expected_output = {
// teknium/OpenHermes-2.5-Mistral-7B
@ -133,6 +135,8 @@ int main(void) {
"[SYSTEM_PROMPT] You are a helpful assistant[/SYSTEM_PROMPT][INST] Hello[/INST] Hi there</s>[INST] Who are you[/INST] I am an assistant </s>[INST] Another question[/INST]",
// ai-sage/GigaChat-20B-A3B-instruct
"<s>You are a helpful assistant<|message_sep|>user<|role_sep|>Hello<|message_sep|>available functions<|role_sep|>[]<|message_sep|>assistant<|role_sep|>Hi there<|message_sep|>user<|role_sep|>Who are you<|message_sep|>available functions<|role_sep|>[]<|message_sep|>assistant<|role_sep|> I am an assistant <|message_sep|>user<|role_sep|>Another question<|message_sep|>available functions<|role_sep|>[]<|message_sep|>assistant<|role_sep|>",
// Infinigence/Megrez-3B-Instruct
"<|role_start|>system<|role_end|>You are a helpful assistant<|turn_end|><|role_start|>user<|role_end|>Hello<|turn_end|><|role_start|>assistant<|role_end|>Hi there<|turn_end|><|role_start|>user<|role_end|>Who are you<|turn_end|><|role_start|>assistant<|role_end|> I am an assistant <|turn_end|><|role_start|>user<|role_end|>Another question<|turn_end|><|role_start|>assistant<|role_end|>",
};
std::vector<char> formatted_chat(1024);
int32_t res;