Merge remote-tracking branch 'origin/master' into sync
ggml-ci
This commit is contained in:
commit
76d5b7f76c
26 changed files with 1464 additions and 335 deletions
1
.flake8
1
.flake8
|
@ -1,2 +1,3 @@
|
||||||
[flake8]
|
[flake8]
|
||||||
max-line-length = 125
|
max-line-length = 125
|
||||||
|
ignore = W503
|
||||||
|
|
|
@ -340,13 +340,14 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
sparams.samplers_sequence = parse_samplers_input(argv[i]);
|
const auto sampler_names = string_split(argv[i], ';');
|
||||||
|
sparams.samplers_sequence = sampler_types_from_names(sampler_names);
|
||||||
} else if (arg == "--sampling-seq") {
|
} else if (arg == "--sampling-seq") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
sparams.samplers_sequence = argv[i];
|
sparams.samplers_sequence = sampler_types_from_chars(argv[i]);
|
||||||
} else if (arg == "--top-p") {
|
} else if (arg == "--top-p") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
|
@ -906,6 +907,14 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||||
void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||||
const llama_sampling_params & sparams = params.sparams;
|
const llama_sampling_params & sparams = params.sparams;
|
||||||
|
|
||||||
|
std::string sampler_type_chars;
|
||||||
|
std::string sampler_type_names;
|
||||||
|
for (const auto sampler_type : sparams.samplers_sequence) {
|
||||||
|
sampler_type_chars += static_cast<char>(sampler_type);
|
||||||
|
sampler_type_names += sampler_type_to_name_string(sampler_type) + ";";
|
||||||
|
}
|
||||||
|
sampler_type_names.pop_back();
|
||||||
|
|
||||||
printf("\n");
|
printf("\n");
|
||||||
printf("usage: %s [options]\n", argv[0]);
|
printf("usage: %s [options]\n", argv[0]);
|
||||||
printf("\n");
|
printf("\n");
|
||||||
|
@ -947,8 +956,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||||
printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
|
printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
|
||||||
printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
|
printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
|
||||||
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
||||||
printf(" --samplers samplers that will be used for generation in the order, separated by \';\', for example: \"top_k;tfs;typical;top_p;min_p;temp\"\n");
|
printf(" --samplers samplers that will be used for generation in the order, separated by \';\' (default: %s)\n", sampler_type_names.c_str());
|
||||||
printf(" --sampling-seq simplified sequence for samplers that will be used (default: %s)\n", sparams.samplers_sequence.c_str());
|
printf(" --sampling-seq simplified sequence for samplers that will be used (default: %s)\n", sampler_type_chars.c_str());
|
||||||
printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k);
|
printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k);
|
||||||
printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
|
printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
|
||||||
printf(" --min-p N min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p);
|
printf(" --min-p N min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p);
|
||||||
|
@ -1097,45 +1106,85 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
// String parsing
|
// String utils
|
||||||
//
|
//
|
||||||
|
|
||||||
std::string parse_samplers_input(std::string input) {
|
std::vector<std::string> string_split(std::string input, char separator) {
|
||||||
std::string output = "";
|
std::vector<std::string> parts;
|
||||||
|
size_t separator_pos = input.find(separator);
|
||||||
|
while (separator_pos != std::string::npos) {
|
||||||
|
std::string part = input.substr(0, separator_pos);
|
||||||
|
parts.emplace_back(part);
|
||||||
|
input = input.substr(separator_pos + 1);
|
||||||
|
separator_pos = input.find(separator);
|
||||||
|
}
|
||||||
|
parts.emplace_back(input);
|
||||||
|
return parts;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names) {
|
||||||
// since samplers names are written multiple ways
|
// since samplers names are written multiple ways
|
||||||
// make it ready for both system names and input names
|
// make it ready for both system names and input names
|
||||||
std::unordered_map<std::string, char> samplers_symbols {
|
std::unordered_map<std::string, llama_sampler_type> sampler_name_map {
|
||||||
{"top_k", 'k'},
|
{"top_k", llama_sampler_type::TOP_K},
|
||||||
{"top-k", 'k'},
|
{"top-k", llama_sampler_type::TOP_K},
|
||||||
{"top_p", 'p'},
|
{"top_p", llama_sampler_type::TOP_P},
|
||||||
{"top-p", 'p'},
|
{"top-p", llama_sampler_type::TOP_P},
|
||||||
{"nucleus", 'p'},
|
{"nucleus", llama_sampler_type::TOP_P},
|
||||||
{"typical_p", 'y'},
|
{"typical_p", llama_sampler_type::TYPICAL_P},
|
||||||
{"typical-p", 'y'},
|
{"typical-p", llama_sampler_type::TYPICAL_P},
|
||||||
{"typical", 'y'},
|
{"typical", llama_sampler_type::TYPICAL_P},
|
||||||
{"min_p", 'm'},
|
{"min_p", llama_sampler_type::MIN_P},
|
||||||
{"min-p", 'm'},
|
{"min-p", llama_sampler_type::MIN_P},
|
||||||
{"tfs_z", 'f'},
|
{"tfs_z", llama_sampler_type::TFS_Z},
|
||||||
{"tfs-z", 'f'},
|
{"tfs-z", llama_sampler_type::TFS_Z},
|
||||||
{"tfs", 'f'},
|
{"tfs", llama_sampler_type::TFS_Z},
|
||||||
{"temp", 't'},
|
{"temp", llama_sampler_type::TEMP},
|
||||||
{"temperature",'t'}
|
{"temperature", llama_sampler_type::TEMP}
|
||||||
};
|
};
|
||||||
// expected format example: "temp;top_k;tfs_z;typical_p;top_p;min_p"
|
|
||||||
size_t separator = input.find(';');
|
|
||||||
while (separator != input.npos) {
|
|
||||||
std::string name = input.substr(0,separator);
|
|
||||||
input = input.substr(separator+1);
|
|
||||||
separator = input.find(';');
|
|
||||||
|
|
||||||
if (samplers_symbols.find(name) != samplers_symbols.end()) {
|
std::vector<llama_sampler_type> sampler_types;
|
||||||
output += samplers_symbols[name];
|
sampler_types.reserve(names.size());
|
||||||
|
for (const auto& name : names) {
|
||||||
|
const auto sampler_item = sampler_name_map.find(name);
|
||||||
|
if (sampler_item != sampler_name_map.end()) {
|
||||||
|
sampler_types.push_back(sampler_item->second);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (samplers_symbols.find(input) != samplers_symbols.end()) {
|
return sampler_types;
|
||||||
output += samplers_symbols[input];
|
}
|
||||||
|
|
||||||
|
std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string) {
|
||||||
|
std::unordered_map<char, llama_sampler_type> sampler_name_map {
|
||||||
|
{'k', llama_sampler_type::TOP_K},
|
||||||
|
{'p', llama_sampler_type::TOP_P},
|
||||||
|
{'y', llama_sampler_type::TYPICAL_P},
|
||||||
|
{'m', llama_sampler_type::MIN_P},
|
||||||
|
{'f', llama_sampler_type::TFS_Z},
|
||||||
|
{'t', llama_sampler_type::TEMP}
|
||||||
|
};
|
||||||
|
|
||||||
|
std::vector<llama_sampler_type> sampler_types;
|
||||||
|
sampler_types.reserve(names_string.size());
|
||||||
|
for (const auto & c : names_string) {
|
||||||
|
const auto sampler_item = sampler_name_map.find(c);
|
||||||
|
if (sampler_item != sampler_name_map.end()) {
|
||||||
|
sampler_types.push_back(sampler_item->second);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return sampler_types;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string sampler_type_to_name_string(llama_sampler_type sampler_type) {
|
||||||
|
switch (sampler_type) {
|
||||||
|
case llama_sampler_type::TOP_K: return "top_k";
|
||||||
|
case llama_sampler_type::TFS_Z: return "tfs_z";
|
||||||
|
case llama_sampler_type::TYPICAL_P: return "typical_p";
|
||||||
|
case llama_sampler_type::TOP_P: return "top_p";
|
||||||
|
case llama_sampler_type::MIN_P: return "min_p";
|
||||||
|
case llama_sampler_type::TEMP: return "temp";
|
||||||
|
default : return "";
|
||||||
}
|
}
|
||||||
return output;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
|
@ -1550,6 +1599,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
|
||||||
fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
|
fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
|
||||||
fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false");
|
fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false");
|
||||||
fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false");
|
fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false");
|
||||||
|
fprintf(stream, "cpu_has_matmul_int8: %s\n", ggml_cpu_has_matmul_int8() ? "true" : "false");
|
||||||
|
|
||||||
#ifdef NDEBUG
|
#ifdef NDEBUG
|
||||||
fprintf(stream, "debug: false\n");
|
fprintf(stream, "debug: false\n");
|
||||||
|
|
|
@ -162,10 +162,13 @@ std::string gpt_random_prompt(std::mt19937 & rng);
|
||||||
void process_escapes(std::string& input);
|
void process_escapes(std::string& input);
|
||||||
|
|
||||||
//
|
//
|
||||||
// String parsing
|
// String utils
|
||||||
//
|
//
|
||||||
|
|
||||||
std::string parse_samplers_input(std::string input);
|
std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names);
|
||||||
|
std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string);
|
||||||
|
std::vector<std::string> string_split(std::string input, char separator);
|
||||||
|
std::string sampler_type_to_name_string(llama_sampler_type sampler_type);
|
||||||
|
|
||||||
//
|
//
|
||||||
// Model utils
|
// Model utils
|
||||||
|
|
|
@ -103,15 +103,10 @@ std::string llama_sampling_print(const llama_sampling_params & params) {
|
||||||
std::string llama_sampling_order_print(const llama_sampling_params & params) {
|
std::string llama_sampling_order_print(const llama_sampling_params & params) {
|
||||||
std::string result = "CFG -> Penalties ";
|
std::string result = "CFG -> Penalties ";
|
||||||
if (params.mirostat == 0) {
|
if (params.mirostat == 0) {
|
||||||
for (auto s : params.samplers_sequence) {
|
for (auto sampler_type : params.samplers_sequence) {
|
||||||
switch (s) {
|
const auto sampler_type_name = sampler_type_to_name_string(sampler_type);
|
||||||
case 'k': result += "-> top_k "; break;
|
if (!sampler_type_name.empty()) {
|
||||||
case 'f': result += "-> tfs_z "; break;
|
result += "-> " + sampler_type_name + " ";
|
||||||
case 'y': result += "-> typical_p "; break;
|
|
||||||
case 'p': result += "-> top_p "; break;
|
|
||||||
case 'm': result += "-> min_p "; break;
|
|
||||||
case 't': result += "-> temp "; break;
|
|
||||||
default : break;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
@ -127,8 +122,6 @@ static void sampler_queue(
|
||||||
const llama_sampling_params & params,
|
const llama_sampling_params & params,
|
||||||
llama_token_data_array & cur_p,
|
llama_token_data_array & cur_p,
|
||||||
size_t & min_keep) {
|
size_t & min_keep) {
|
||||||
const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
|
|
||||||
|
|
||||||
const float temp = params.temp;
|
const float temp = params.temp;
|
||||||
const float dynatemp_range = params.dynatemp_range;
|
const float dynatemp_range = params.dynatemp_range;
|
||||||
const float dynatemp_exponent = params.dynatemp_exponent;
|
const float dynatemp_exponent = params.dynatemp_exponent;
|
||||||
|
@ -137,16 +130,16 @@ static void sampler_queue(
|
||||||
const float min_p = params.min_p;
|
const float min_p = params.min_p;
|
||||||
const float tfs_z = params.tfs_z;
|
const float tfs_z = params.tfs_z;
|
||||||
const float typical_p = params.typical_p;
|
const float typical_p = params.typical_p;
|
||||||
const std::string & samplers_sequence = params.samplers_sequence;
|
const std::vector<llama_sampler_type> & samplers_sequence = params.samplers_sequence;
|
||||||
|
|
||||||
for (auto s : samplers_sequence) {
|
for (auto sampler_type : samplers_sequence) {
|
||||||
switch (s){
|
switch (sampler_type) {
|
||||||
case 'k': llama_sample_top_k (ctx_main, &cur_p, top_k, min_keep); break;
|
case llama_sampler_type::TOP_K : llama_sample_top_k (ctx_main, &cur_p, top_k, min_keep); break;
|
||||||
case 'f': llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep); break;
|
case llama_sampler_type::TFS_Z : llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep); break;
|
||||||
case 'y': llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); break;
|
case llama_sampler_type::TYPICAL_P: llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); break;
|
||||||
case 'p': llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); break;
|
case llama_sampler_type::TOP_P : llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); break;
|
||||||
case 'm': llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); break;
|
case llama_sampler_type::MIN_P : llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); break;
|
||||||
case 't':
|
case llama_sampler_type::TEMP:
|
||||||
if (dynatemp_range > 0) {
|
if (dynatemp_range > 0) {
|
||||||
float dynatemp_min = std::max(0.0f, temp - dynatemp_range);
|
float dynatemp_min = std::max(0.0f, temp - dynatemp_range);
|
||||||
float dynatemp_max = std::max(0.0f, temp + dynatemp_range);
|
float dynatemp_max = std::max(0.0f, temp + dynatemp_range);
|
||||||
|
|
|
@ -8,6 +8,16 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <unordered_map>
|
#include <unordered_map>
|
||||||
|
|
||||||
|
// sampler types
|
||||||
|
enum class llama_sampler_type : char {
|
||||||
|
TOP_K = 'k',
|
||||||
|
TOP_P = 'p',
|
||||||
|
MIN_P = 'm',
|
||||||
|
TFS_Z = 'f',
|
||||||
|
TYPICAL_P = 'y',
|
||||||
|
TEMP = 't'
|
||||||
|
};
|
||||||
|
|
||||||
// sampling parameters
|
// sampling parameters
|
||||||
typedef struct llama_sampling_params {
|
typedef struct llama_sampling_params {
|
||||||
int32_t n_prev = 64; // number of previous tokens to remember
|
int32_t n_prev = 64; // number of previous tokens to remember
|
||||||
|
@ -28,7 +38,15 @@ typedef struct llama_sampling_params {
|
||||||
float mirostat_tau = 5.00f; // target entropy
|
float mirostat_tau = 5.00f; // target entropy
|
||||||
float mirostat_eta = 0.10f; // learning rate
|
float mirostat_eta = 0.10f; // learning rate
|
||||||
bool penalize_nl = true; // consider newlines as a repeatable token
|
bool penalize_nl = true; // consider newlines as a repeatable token
|
||||||
std::string samplers_sequence = "kfypmt"; // top_k, tail_free, typical_p, top_p, min_p, temp
|
|
||||||
|
std::vector<llama_sampler_type> samplers_sequence = {
|
||||||
|
llama_sampler_type::TOP_K,
|
||||||
|
llama_sampler_type::TFS_Z,
|
||||||
|
llama_sampler_type::TYPICAL_P,
|
||||||
|
llama_sampler_type::TOP_P,
|
||||||
|
llama_sampler_type::MIN_P,
|
||||||
|
llama_sampler_type::TEMP
|
||||||
|
};
|
||||||
|
|
||||||
std::string grammar; // optional BNF-like grammar to constrain sampling
|
std::string grammar; // optional BNF-like grammar to constrain sampling
|
||||||
|
|
||||||
|
|
|
@ -209,6 +209,8 @@ class Model:
|
||||||
return InternLM2Model
|
return InternLM2Model
|
||||||
if model_architecture == "MiniCPMForCausalLM":
|
if model_architecture == "MiniCPMForCausalLM":
|
||||||
return MiniCPMModel
|
return MiniCPMModel
|
||||||
|
if model_architecture == "BertModel":
|
||||||
|
return BertModel
|
||||||
return Model
|
return Model
|
||||||
|
|
||||||
def _is_model_safetensors(self) -> bool:
|
def _is_model_safetensors(self) -> bool:
|
||||||
|
@ -264,6 +266,8 @@ class Model:
|
||||||
return gguf.MODEL_ARCH.INTERNLM2
|
return gguf.MODEL_ARCH.INTERNLM2
|
||||||
if arch == "MiniCPMForCausalLM":
|
if arch == "MiniCPMForCausalLM":
|
||||||
return gguf.MODEL_ARCH.MINICPM
|
return gguf.MODEL_ARCH.MINICPM
|
||||||
|
if arch == "BertModel":
|
||||||
|
return gguf.MODEL_ARCH.BERT
|
||||||
|
|
||||||
raise NotImplementedError(f'Architecture "{arch}" not supported!')
|
raise NotImplementedError(f'Architecture "{arch}" not supported!')
|
||||||
|
|
||||||
|
@ -1629,6 +1633,96 @@ in chat mode so that the conversation can end normally.")
|
||||||
self.post_write_tensors(tensor_map, name, data_torch)
|
self.post_write_tensors(tensor_map, name, data_torch)
|
||||||
|
|
||||||
|
|
||||||
|
class BertModel(Model):
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
super().__init__(*args, **kwargs)
|
||||||
|
self.block_count = self.hparams["num_hidden_layers"]
|
||||||
|
|
||||||
|
def set_gguf_parameters(self):
|
||||||
|
# TODO(cebtenzzre): merge with parent class
|
||||||
|
self.gguf_writer.add_name(self.dir_model.name)
|
||||||
|
self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
|
||||||
|
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
|
||||||
|
self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
|
||||||
|
self.gguf_writer.add_block_count(self.block_count)
|
||||||
|
self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
|
||||||
|
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])
|
||||||
|
self.gguf_writer.add_causal_attention(False)
|
||||||
|
self.gguf_writer.add_file_type(self.ftype)
|
||||||
|
|
||||||
|
def set_vocab(self):
|
||||||
|
path = self.dir_model
|
||||||
|
added_tokens_path = self.dir_model if self.dir_model.exists() else None
|
||||||
|
|
||||||
|
# use huggingface vocab to get all tokens
|
||||||
|
vocab = HfVocab(path, added_tokens_path)
|
||||||
|
tokens, scores, toktypes = zip(*vocab.all_tokens())
|
||||||
|
assert len(tokens) == vocab.vocab_size
|
||||||
|
|
||||||
|
# we need this to validate the size of the token_type embeddings
|
||||||
|
# though currently we are passing all zeros to the token_type embeddings
|
||||||
|
n_token_types = len(set(toktypes))
|
||||||
|
self.gguf_writer.add_token_type_count(n_token_types)
|
||||||
|
|
||||||
|
# convert to phantom space vocab
|
||||||
|
def phantom(tok, typ):
|
||||||
|
if tok.startswith(b"[") and tok.endswith(b"]"):
|
||||||
|
return tok
|
||||||
|
if tok.startswith(b"##"):
|
||||||
|
return tok[2:]
|
||||||
|
return b"\xe2\x96\x81" + tok
|
||||||
|
tokens = [phantom(t, y) for t, y in zip(tokens, toktypes)]
|
||||||
|
|
||||||
|
# set up bos and eos tokens (cls and sep)
|
||||||
|
self.gguf_writer.add_bos_token_id(vocab.tokenizer.cls_token_id)
|
||||||
|
self.gguf_writer.add_eos_token_id(vocab.tokenizer.sep_token_id)
|
||||||
|
|
||||||
|
# add vocab to gguf
|
||||||
|
self.gguf_writer.add_tokenizer_model("bert")
|
||||||
|
self.gguf_writer.add_token_list(tokens)
|
||||||
|
self.gguf_writer.add_token_scores(scores)
|
||||||
|
self.gguf_writer.add_token_types(toktypes)
|
||||||
|
|
||||||
|
# handle special tokens
|
||||||
|
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
|
||||||
|
special_vocab.add_to_gguf(self.gguf_writer)
|
||||||
|
|
||||||
|
def write_tensors(self):
|
||||||
|
tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
|
||||||
|
tensors = dict(self.get_tensors())
|
||||||
|
for name, data_torch in tensors.items():
|
||||||
|
# we are only using BERT for embeddings so we don't need the pooling layer
|
||||||
|
if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"):
|
||||||
|
continue # we don't need these
|
||||||
|
|
||||||
|
# map tensor names
|
||||||
|
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
||||||
|
if new_name is None:
|
||||||
|
print(f"Can not map tensor {name!r}")
|
||||||
|
sys.exit()
|
||||||
|
|
||||||
|
data = data_torch.squeeze().numpy()
|
||||||
|
n_dims = len(data.shape)
|
||||||
|
new_dtype: type[np.floating[Any]]
|
||||||
|
|
||||||
|
if (
|
||||||
|
self.ftype == 1 and name.endswith(".weight") and n_dims == 2
|
||||||
|
and name != "embeddings.token_type_embeddings.weight" # not used with get_rows, must be F32
|
||||||
|
):
|
||||||
|
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
||||||
|
new_dtype = np.float16
|
||||||
|
else:
|
||||||
|
# if f32 desired, convert any float16 to float32
|
||||||
|
new_dtype = np.float32
|
||||||
|
|
||||||
|
print(f"{new_name}, n_dims = {n_dims}, {data_torch.dtype} --> {new_dtype}")
|
||||||
|
|
||||||
|
if data.dtype != new_dtype:
|
||||||
|
data = data.astype(new_dtype)
|
||||||
|
|
||||||
|
self.gguf_writer.add_tensor(new_name, data)
|
||||||
|
|
||||||
|
|
||||||
###### CONVERSION LOGIC ######
|
###### CONVERSION LOGIC ######
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -87,7 +87,17 @@ int main(int argc, char ** argv) {
|
||||||
}
|
}
|
||||||
|
|
||||||
const int n_embd = llama_n_embd(model);
|
const int n_embd = llama_n_embd(model);
|
||||||
const auto * embeddings = llama_get_embeddings(ctx);
|
auto * embeddings = llama_get_embeddings(ctx);
|
||||||
|
|
||||||
|
// l2-normalize embeddings
|
||||||
|
float norm = 0;
|
||||||
|
for (int i = 0; i < n_embd; i++) {
|
||||||
|
norm += embeddings[i] * embeddings[i];
|
||||||
|
}
|
||||||
|
norm = sqrt(norm);
|
||||||
|
for (int i = 0; i < n_embd; i++) {
|
||||||
|
embeddings[i] /= norm;
|
||||||
|
}
|
||||||
|
|
||||||
for (int i = 0; i < n_embd; i++) {
|
for (int i = 0; i < n_embd; i++) {
|
||||||
printf("%f ", embeddings[i]);
|
printf("%f ", embeddings[i]);
|
||||||
|
|
|
@ -98,7 +98,7 @@ static void write_logfile(
|
||||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
|
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
|
||||||
static void sigint_handler(int signo) {
|
static void sigint_handler(int signo) {
|
||||||
if (signo == SIGINT) {
|
if (signo == SIGINT) {
|
||||||
if (!is_interacting) {
|
if (!is_interacting && g_params->interactive) {
|
||||||
is_interacting = true;
|
is_interacting = true;
|
||||||
} else {
|
} else {
|
||||||
console::cleanup();
|
console::cleanup();
|
||||||
|
@ -392,7 +392,8 @@ int main(int argc, char ** argv) {
|
||||||
LOG_TEE("\n");
|
LOG_TEE("\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.interactive) {
|
// ctrl+C handling
|
||||||
|
{
|
||||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
|
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
|
||||||
struct sigaction sigint_action;
|
struct sigaction sigint_action;
|
||||||
sigint_action.sa_handler = sigint_handler;
|
sigint_action.sa_handler = sigint_handler;
|
||||||
|
@ -405,7 +406,9 @@ int main(int argc, char ** argv) {
|
||||||
};
|
};
|
||||||
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
|
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
|
||||||
#endif
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
if (params.interactive) {
|
||||||
LOG_TEE("%s: interactive mode on.\n", __func__);
|
LOG_TEE("%s: interactive mode on.\n", __func__);
|
||||||
|
|
||||||
if (!params.antiprompt.empty()) {
|
if (!params.antiprompt.empty()) {
|
||||||
|
|
|
@ -185,7 +185,7 @@ node index.js
|
||||||
|
|
||||||
`ignore_eos`: Ignore end of stream token and continue generating (default: false).
|
`ignore_eos`: Ignore end of stream token and continue generating (default: false).
|
||||||
|
|
||||||
`logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced (default: []).
|
`logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced. Tokens can also be given as strings: for example, `[["Hello, World!",-0.5]]` reduces the likelihood of each individual token that makes up the string `Hello, World!`, similar to what `presence_penalty` does (default: []).
|
||||||
|
|
||||||
`n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token (default: 0)
|
`n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token (default: 0)
|
||||||
|
|
||||||
|
|
|
@ -626,18 +626,36 @@ struct llama_server_context
|
||||||
const int n_vocab = llama_n_vocab(model);
|
const int n_vocab = llama_n_vocab(model);
|
||||||
for (const auto &el : *logit_bias)
|
for (const auto &el : *logit_bias)
|
||||||
{
|
{
|
||||||
if (el.is_array() && el.size() == 2 && el[0].is_number_integer())
|
if (el.is_array() && el.size() == 2)
|
||||||
|
{
|
||||||
|
float bias;
|
||||||
|
if (el[1].is_number())
|
||||||
|
{
|
||||||
|
bias = el[1].get<float>();
|
||||||
|
}
|
||||||
|
else if (el[1].is_boolean() && !el[1].get<bool>())
|
||||||
|
{
|
||||||
|
bias = -INFINITY;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (el[0].is_number_integer())
|
||||||
{
|
{
|
||||||
llama_token tok = el[0].get<llama_token>();
|
llama_token tok = el[0].get<llama_token>();
|
||||||
if (tok >= 0 && tok < n_vocab)
|
if (tok >= 0 && tok < n_vocab)
|
||||||
{
|
{
|
||||||
if (el[1].is_number())
|
slot->sparams.logit_bias[tok] = bias;
|
||||||
{
|
|
||||||
slot->sparams.logit_bias[tok] = el[1].get<float>();
|
|
||||||
}
|
}
|
||||||
else if (el[1].is_boolean() && !el[1].get<bool>())
|
}
|
||||||
|
else if (el[0].is_string())
|
||||||
{
|
{
|
||||||
slot->sparams.logit_bias[tok] = -INFINITY;
|
auto toks = llama_tokenize(model, el[0].get<std::string>(), false);
|
||||||
|
for (auto tok : toks)
|
||||||
|
{
|
||||||
|
slot->sparams.logit_bias[tok] = bias;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
6
flake.lock
generated
6
flake.lock
generated
|
@ -20,11 +20,11 @@
|
||||||
},
|
},
|
||||||
"nixpkgs": {
|
"nixpkgs": {
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1706732774,
|
"lastModified": 1707268954,
|
||||||
"narHash": "sha256-hqJlyJk4MRpcItGYMF+3uHe8HvxNETWvlGtLuVpqLU0=",
|
"narHash": "sha256-2en1kvde3cJVc3ZnTy8QeD2oKcseLFjYPLKhIGDanQ0=",
|
||||||
"owner": "NixOS",
|
"owner": "NixOS",
|
||||||
"repo": "nixpkgs",
|
"repo": "nixpkgs",
|
||||||
"rev": "b8b232ae7b8b144397fdb12d20f592e5e7c1a64d",
|
"rev": "f8e2ebd66d097614d51a56a755450d4ae1632df1",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
|
|
223
ggml-cuda.cu
223
ggml-cuda.cu
|
@ -150,8 +150,8 @@
|
||||||
#define CUDA_USE_TENSOR_CORES
|
#define CUDA_USE_TENSOR_CORES
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// max batch size to use MMQ kernels when tensor cores are available
|
#define MMVQ_MAX_BATCH_SIZE 8 // max batch size to use MMVQ kernels
|
||||||
#define MMQ_MAX_BATCH_SIZE 32
|
#define MMQ_MAX_BATCH_SIZE 32 // max batch size to use MMQ kernels when tensor cores are available
|
||||||
|
|
||||||
#if defined(GGML_USE_HIPBLAS)
|
#if defined(GGML_USE_HIPBLAS)
|
||||||
#define __CUDA_ARCH__ 1300
|
#define __CUDA_ARCH__ 1300
|
||||||
|
@ -5310,51 +5310,59 @@ template <bool need_check> static __global__ void
|
||||||
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
||||||
}
|
}
|
||||||
|
|
||||||
#define MMVQ_NWARPS_NVIDIA 4
|
template <int ncols_y, int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
|
||||||
#define MMVQ_NWARPS_AMD_RDNA2 1
|
|
||||||
#define MMVQ_NWARPS_AMD_OLD 4
|
|
||||||
|
|
||||||
template <int nwarps, int ncols_y_template, int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
|
|
||||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||||
__launch_bounds__(nwarps*WARP_SIZE, 1) // tells the compiler to use as many registers as it wants
|
// tell the compiler to use as many registers as it wants, see nwarps definition below
|
||||||
|
__launch_bounds__((ncols_y <= 4 ? 4 : 2)*WARP_SIZE, 1)
|
||||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||||
static __global__ void mul_mat_vec_q(
|
static __global__ void mul_mat_vec_q(
|
||||||
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
||||||
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y_par, const int nrows_dst) {
|
const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) {
|
||||||
|
|
||||||
const int ncols_y = ncols_y_template != 0 ? ncols_y_template : ncols_y_par;
|
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
|
||||||
|
constexpr int nwarps = 1;
|
||||||
|
constexpr int rows_per_cuda_block = 1;
|
||||||
|
#else
|
||||||
|
constexpr int nwarps = ncols_y <= 4 ? 4 : 2;
|
||||||
|
constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 2;
|
||||||
|
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
|
||||||
|
|
||||||
const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
|
const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
|
||||||
const int row = blockIdx.x;
|
const int row0 = rows_per_cuda_block*blockIdx.x;
|
||||||
|
|
||||||
const int blocks_per_row_x = ncols_x / qk;
|
const int blocks_per_row_x = ncols_x / qk;
|
||||||
const int blocks_per_col_y = nrows_y / QK8_1;
|
const int blocks_per_col_y = nrows_y / QK8_1;
|
||||||
const int blocks_per_iter = vdr * nwarps*WARP_SIZE / qi;
|
constexpr int blocks_per_iter = vdr * nwarps*WARP_SIZE / qi;
|
||||||
|
|
||||||
// partial sum for each thread
|
// partial sum for each thread
|
||||||
float tmp[ncols_y_template != 0 ? ncols_y_template : 8] = {0.0f};
|
float tmp[ncols_y][rows_per_cuda_block] = {0.0f};
|
||||||
|
|
||||||
const block_q_t * x = (const block_q_t *) vx;
|
const block_q_t * x = (const block_q_t *) vx;
|
||||||
const block_q8_1 * y = (const block_q8_1 *) vy;
|
const block_q8_1 * y = (const block_q8_1 *) vy;
|
||||||
|
|
||||||
for (int i = tid / (qi/vdr); i < blocks_per_row_x; i += blocks_per_iter) {
|
for (int kbx = tid / (qi/vdr); kbx < blocks_per_row_x; kbx += blocks_per_iter) {
|
||||||
const int ibx = row*blocks_per_row_x + i; // x block index
|
const int kby = kbx * (qk/QK8_1); // y block index that aligns with kbx
|
||||||
|
|
||||||
const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
|
// x block quant index when casting the quants to int
|
||||||
|
const int kqs = vdr * (tid % (qi/vdr));
|
||||||
const int iqs = vdr * (tid % (qi/vdr)); // x block quant index when casting the quants to int
|
|
||||||
|
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int j = 0; j < ncols_y; ++j) {
|
for (int j = 0; j < ncols_y; ++j) {
|
||||||
tmp[j] += vec_dot_q_cuda(&x[ibx], &y[j*blocks_per_col_y + iby], iqs);
|
#pragma unroll
|
||||||
|
for (int i = 0; i < rows_per_cuda_block; ++i) {
|
||||||
|
tmp[j][i] += vec_dot_q_cuda(
|
||||||
|
&x[kbx + (row0 + i)*blocks_per_row_x], &y[j*blocks_per_col_y + kby], kqs);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
__shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_y_template != 0 ? ncols_y_template : 8][WARP_SIZE];
|
__shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_y][rows_per_cuda_block][WARP_SIZE];
|
||||||
if (threadIdx.y > 0) {
|
if (threadIdx.y > 0) {
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int j = 0; j < ncols_y; ++j) {
|
for (int j = 0; j < ncols_y; ++j) {
|
||||||
tmp_shared[threadIdx.y-1][j][threadIdx.x] = tmp[j];
|
#pragma unroll
|
||||||
|
for (int i = 0; i < rows_per_cuda_block; ++i) {
|
||||||
|
tmp_shared[threadIdx.y-1][j][i][threadIdx.x] = tmp[j][i];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
@ -5366,13 +5374,16 @@ static __global__ void mul_mat_vec_q(
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int j = 0; j < ncols_y; ++j) {
|
for (int j = 0; j < ncols_y; ++j) {
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int i = 0; i < nwarps-1; ++i) {
|
for (int i = 0; i < rows_per_cuda_block; ++i) {
|
||||||
tmp[j] += tmp_shared[i][j][threadIdx.x];
|
#pragma unroll
|
||||||
|
for (int l = 0; l < nwarps-1; ++l) {
|
||||||
|
tmp[j][i] += tmp_shared[l][j][i][threadIdx.x];
|
||||||
|
}
|
||||||
|
tmp[j][i] = warp_reduce_sum(tmp[j][i]);
|
||||||
}
|
}
|
||||||
tmp[j] = warp_reduce_sum(tmp[j]);
|
|
||||||
|
|
||||||
if (threadIdx.x == 0) {
|
if (threadIdx.x < rows_per_cuda_block) {
|
||||||
dst[j*nrows_dst + row] = tmp[j];
|
dst[j*nrows_dst + row0 + threadIdx.x] = tmp[j][threadIdx.x];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -6851,65 +6862,75 @@ static void mul_mat_vec_q_cuda(
|
||||||
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
|
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
|
||||||
|
|
||||||
GGML_ASSERT(ncols_x % qk == 0);
|
GGML_ASSERT(ncols_x % qk == 0);
|
||||||
GGML_ASSERT(ncols_y <= 4);
|
GGML_ASSERT(ncols_y <= MMVQ_MAX_BATCH_SIZE);
|
||||||
|
|
||||||
int id;
|
int id;
|
||||||
CUDA_CHECK(cudaGetDevice(&id));
|
CUDA_CHECK(cudaGetDevice(&id));
|
||||||
|
|
||||||
int nwarps;
|
int64_t nwarps = 1;
|
||||||
if (g_device_caps[id].cc >= CC_OFFSET_AMD) {
|
int64_t rows_per_cuda_block = 1;
|
||||||
nwarps = g_device_caps[id].cc >= CC_RDNA2 ? MMVQ_NWARPS_AMD_RDNA2 : MMVQ_NWARPS_AMD_OLD;
|
|
||||||
} else {
|
|
||||||
nwarps = MMVQ_NWARPS_NVIDIA;
|
|
||||||
}
|
|
||||||
|
|
||||||
const dim3 block_nums(nrows_x, 1, 1);
|
if (g_device_caps[id].cc < CC_RDNA2) { // NVIDIA and AMD older than RDNA2
|
||||||
|
switch(ncols_y) {
|
||||||
|
case 1:
|
||||||
|
nwarps = 4;
|
||||||
|
rows_per_cuda_block = 1;
|
||||||
|
break;
|
||||||
|
case 2:
|
||||||
|
case 3:
|
||||||
|
case 4:
|
||||||
|
nwarps = 4;
|
||||||
|
rows_per_cuda_block = 2;
|
||||||
|
break;
|
||||||
|
case 5:
|
||||||
|
case 6:
|
||||||
|
case 7:
|
||||||
|
case 8:
|
||||||
|
nwarps = 2;
|
||||||
|
rows_per_cuda_block = 2;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
GGML_ASSERT(false);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
const int64_t nblocks = (nrows_x + rows_per_cuda_block - 1) / rows_per_cuda_block;
|
||||||
|
const dim3 block_nums(nblocks, 1, 1);
|
||||||
const dim3 block_dims(WARP_SIZE, nwarps, 1);
|
const dim3 block_dims(WARP_SIZE, nwarps, 1);
|
||||||
|
|
||||||
switch (nwarps) {
|
switch (ncols_y) {
|
||||||
case 1: switch(ncols_y) {
|
|
||||||
case 1:
|
case 1:
|
||||||
mul_mat_vec_q<1, 1, qk, qi, block_q_t, vdr, vec_dot>
|
mul_mat_vec_q<1, qk, qi, block_q_t, vdr, vec_dot>
|
||||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
|
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
|
||||||
break;
|
break;
|
||||||
case 2:
|
case 2:
|
||||||
mul_mat_vec_q<1, 2, qk, qi, block_q_t, vdr, vec_dot>
|
mul_mat_vec_q<2, qk, qi, block_q_t, vdr, vec_dot>
|
||||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
|
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
|
||||||
break;
|
break;
|
||||||
case 3:
|
case 3:
|
||||||
mul_mat_vec_q<1, 3, qk, qi, block_q_t, vdr, vec_dot>
|
mul_mat_vec_q<3, qk, qi, block_q_t, vdr, vec_dot>
|
||||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
|
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
|
||||||
break;
|
break;
|
||||||
case 4:
|
case 4:
|
||||||
mul_mat_vec_q<1, 4, qk, qi, block_q_t, vdr, vec_dot>
|
mul_mat_vec_q<4, qk, qi, block_q_t, vdr, vec_dot>
|
||||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
|
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
|
||||||
break;
|
break;
|
||||||
default:
|
case 5:
|
||||||
GGML_ASSERT(false);
|
mul_mat_vec_q<5, qk, qi, block_q_t, vdr, vec_dot>
|
||||||
|
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
|
||||||
break;
|
break;
|
||||||
} break;
|
case 6:
|
||||||
case 4: switch(ncols_y) {
|
mul_mat_vec_q<6, qk, qi, block_q_t, vdr, vec_dot>
|
||||||
case 1:
|
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
|
||||||
mul_mat_vec_q<4, 1, qk, qi, block_q_t, vdr, vec_dot>
|
|
||||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
|
|
||||||
break;
|
break;
|
||||||
case 2:
|
case 7:
|
||||||
mul_mat_vec_q<4, 2, qk, qi, block_q_t, vdr, vec_dot>
|
mul_mat_vec_q<7, qk, qi, block_q_t, vdr, vec_dot>
|
||||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
|
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
|
||||||
break;
|
break;
|
||||||
case 3:
|
case 8:
|
||||||
mul_mat_vec_q<4, 3, qk, qi, block_q_t, vdr, vec_dot>
|
mul_mat_vec_q<8, qk, qi, block_q_t, vdr, vec_dot>
|
||||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
|
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
|
||||||
break;
|
break;
|
||||||
case 4:
|
|
||||||
mul_mat_vec_q<4, 4, qk, qi, block_q_t, vdr, vec_dot>
|
|
||||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
GGML_ASSERT(false);
|
|
||||||
break;
|
|
||||||
} break;
|
|
||||||
|
|
||||||
default:
|
default:
|
||||||
GGML_ASSERT(false);
|
GGML_ASSERT(false);
|
||||||
break;
|
break;
|
||||||
|
@ -9735,7 +9756,7 @@ static __global__ void k_compute_batched_ptrs(
|
||||||
ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3;
|
ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
static void ggml_cuda_mul_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||||
GGML_ASSERT(!ggml_is_transposed(src0));
|
GGML_ASSERT(!ggml_is_transposed(src0));
|
||||||
GGML_ASSERT(!ggml_is_transposed(src1));
|
GGML_ASSERT(!ggml_is_transposed(src1));
|
||||||
|
|
||||||
|
@ -9893,39 +9914,69 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
|
||||||
|
|
||||||
int64_t min_compute_capability = INT_MAX;
|
int64_t min_compute_capability = INT_MAX;
|
||||||
|
|
||||||
|
bool any_pascal_with_slow_fp16 = false;
|
||||||
if (split) {
|
if (split) {
|
||||||
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
|
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
|
||||||
auto & tensor_split = buft_ctx->tensor_split;
|
auto & tensor_split = buft_ctx->tensor_split;
|
||||||
for (int id = 0; id < g_device_count; ++id) {
|
for (int id = 0; id < g_device_count; ++id) {
|
||||||
if (min_compute_capability > g_device_caps[id].cc && tensor_split[id] < (id + 1 < g_device_count ? tensor_split[id + 1] : 1.0f)) {
|
// skip devices that are not going to do any work:
|
||||||
|
if (tensor_split[id] >= (id + 1 < g_device_count ? tensor_split[id + 1] : 1.0f)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (min_compute_capability > g_device_caps[id].cc) {
|
||||||
min_compute_capability = g_device_caps[id].cc;
|
min_compute_capability = g_device_caps[id].cc;
|
||||||
}
|
}
|
||||||
|
if (g_device_caps[id].cc == 610) {
|
||||||
|
any_pascal_with_slow_fp16 = true;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
min_compute_capability = g_device_caps[g_main_device].cc;
|
min_compute_capability = g_device_caps[g_main_device].cc;
|
||||||
|
any_pascal_with_slow_fp16 = g_device_caps[g_main_device].cc == 610;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// check data types and tensor shapes for custom matrix multiplication kernels:
|
||||||
|
bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16)
|
||||||
|
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
|
||||||
|
&& src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->ne[1] == 1;
|
||||||
|
|
||||||
|
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
|
||||||
|
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
|
||||||
|
&& src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
|
||||||
|
|
||||||
|
bool use_mul_mat_q = ggml_cuda_supports_mmq(src0->type)
|
||||||
|
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
|
||||||
|
|
||||||
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
||||||
|
|
||||||
const bool fp16_performance_good = min_compute_capability >= CC_RDNA1;
|
const bool fp16_performance_good = min_compute_capability >= CC_RDNA1;
|
||||||
bool use_mul_mat_q = ggml_is_quantized(src0->type);
|
|
||||||
#ifdef CUDA_USE_TENSOR_CORES
|
#ifdef CUDA_USE_TENSOR_CORES
|
||||||
use_mul_mat_q = use_mul_mat_q && min_compute_capability < CC_RDNA3;
|
use_mul_mat_q = use_mul_mat_q && min_compute_capability < CC_RDNA3;
|
||||||
#endif // CUDA_USE_TENSOR_CORES
|
#endif // CUDA_USE_TENSOR_CORES
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
const bool fp16_performance_good = min_compute_capability >= CC_VOLTA;
|
// fp16 performance is good on Volta or newer and on P100 (compute capability 6.0)
|
||||||
bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
|
const bool fp16_performance_good = min_compute_capability >= CC_PASCAL && !any_pascal_with_slow_fp16;
|
||||||
|
|
||||||
|
// mmvq and mmq need the __dp4a instruction which on NVIDIA is only available for CC >= 6.1
|
||||||
|
use_mul_mat_vec_q = use_mul_mat_vec_q && min_compute_capability >= MIN_CC_DP4A;
|
||||||
|
use_mul_mat_q = use_mul_mat_q && min_compute_capability >= MIN_CC_DP4A;
|
||||||
|
|
||||||
#ifdef CUDA_USE_TENSOR_CORES
|
#ifdef CUDA_USE_TENSOR_CORES
|
||||||
// when tensor cores are available, use them for large batch size
|
// when tensor cores are available, use them for large batch size
|
||||||
// ref: https://github.com/ggerganov/llama.cpp/pull/3776
|
// ref: https://github.com/ggerganov/llama.cpp/pull/3776
|
||||||
use_mul_mat_q = use_mul_mat_q && !(fp16_performance_good && src1->ne[1] > MMQ_MAX_BATCH_SIZE);
|
use_mul_mat_q = use_mul_mat_q && (!fp16_performance_good || src1->ne[1] <= MMQ_MAX_BATCH_SIZE);
|
||||||
#endif // CUDA_USE_TENSOR_CORES
|
#endif // CUDA_USE_TENSOR_CORES
|
||||||
|
|
||||||
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
||||||
|
|
||||||
use_mul_mat_q = use_mul_mat_q && ggml_cuda_supports_mmq(src0->type);
|
// if mmvq is available it's a better choice than dmmv:
|
||||||
|
#ifndef GGML_CUDA_FORCE_DMMV
|
||||||
|
use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
|
||||||
|
#endif // GGML_CUDA_FORCE_DMMV
|
||||||
|
|
||||||
// debug helpers
|
// debug helpers
|
||||||
//printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
|
//printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
|
||||||
|
@ -9943,34 +9994,16 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
|
||||||
ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
|
ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
|
||||||
} else if (!split && all_on_device && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
|
} else if (!split && all_on_device && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
|
||||||
// KQ + KQV multi-batch
|
// KQ + KQV multi-batch
|
||||||
ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
|
ggml_cuda_mul_mat_batched_cublas(src0, src1, dst);
|
||||||
} else if (src0->type == GGML_TYPE_F32) {
|
} else if (use_dequantize_mul_mat_vec) {
|
||||||
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
|
|
||||||
} else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
|
|
||||||
if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->type == GGML_TYPE_F32) {
|
|
||||||
#ifdef GGML_CUDA_FORCE_DMMV
|
|
||||||
const bool use_mul_mat_vec_q = false;
|
|
||||||
#else
|
|
||||||
const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
|
|
||||||
#endif // GGML_CUDA_FORCE_DMMV
|
|
||||||
|
|
||||||
if (use_mul_mat_vec_q) {
|
|
||||||
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
|
|
||||||
} else {
|
|
||||||
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
|
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
|
||||||
}
|
} else if (use_mul_mat_vec_q) {
|
||||||
} else {
|
|
||||||
if (src1->ne[1] <= 4 && min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type) && src1->type == GGML_TYPE_F32) {
|
|
||||||
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
|
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
|
||||||
} else if (use_mul_mat_q) {
|
} else if (use_mul_mat_q) {
|
||||||
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
|
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
|
||||||
} else {
|
} else {
|
||||||
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
|
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
} else {
|
|
||||||
GGML_ASSERT(false);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#if 0
|
#if 0
|
||||||
|
|
332
ggml-quants.c
332
ggml-quants.c
|
@ -49,6 +49,8 @@
|
||||||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||||
|
|
||||||
|
#define UNUSED GGML_UNUSED
|
||||||
|
|
||||||
#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
|
#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
|
||||||
|
|
||||||
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
|
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
|
||||||
|
@ -3677,15 +3679,92 @@ static inline __m128i get_scale_shuffle(int i) {
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||||
const int qk = QK8_0;
|
const int qk = QK8_0;
|
||||||
const int nb = n / qk;
|
const int nb = n / qk;
|
||||||
|
|
||||||
assert(n % qk == 0);
|
assert(n % qk == 0);
|
||||||
|
#if defined(__ARM_FEATURE_MATMUL_INT8)
|
||||||
|
assert((nrc == 2) || (nrc == 1));
|
||||||
|
#else
|
||||||
|
assert(nrc == 1);
|
||||||
|
#endif
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
const block_q4_0 * restrict x = vx;
|
const block_q4_0 * restrict x = vx;
|
||||||
const block_q8_0 * restrict y = vy;
|
const block_q8_0 * restrict y = vy;
|
||||||
|
|
||||||
|
#if defined(__ARM_FEATURE_MATMUL_INT8)
|
||||||
|
if (nrc == 2) {
|
||||||
|
const block_q4_0 * restrict vx0 = vx;
|
||||||
|
const block_q4_0 * restrict vx1 = vx + bx;
|
||||||
|
|
||||||
|
const block_q8_0 * restrict vy0 = vy;
|
||||||
|
const block_q8_0 * restrict vy1 = vy + by;
|
||||||
|
|
||||||
|
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
||||||
|
|
||||||
|
for (int i = 0; i < nb; i++) {
|
||||||
|
const block_q4_0 * restrict b_x0 = &vx0[i];
|
||||||
|
const block_q4_0 * restrict b_x1 = &vx1[i];
|
||||||
|
const block_q8_0 * restrict b_y0 = &vy0[i];
|
||||||
|
const block_q8_0 * restrict b_y1 = &vy1[i];
|
||||||
|
|
||||||
|
const uint8x16_t m4b = vdupq_n_u8(0x0F);
|
||||||
|
const int8x16_t s8b = vdupq_n_s8(0x8);
|
||||||
|
|
||||||
|
const uint8x16_t v0_0 = vld1q_u8(b_x0->qs);
|
||||||
|
const uint8x16_t v0_1 = vld1q_u8(b_x1->qs);
|
||||||
|
|
||||||
|
// 4-bit -> 8-bit
|
||||||
|
const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b));
|
||||||
|
const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
|
||||||
|
const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b));
|
||||||
|
const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
|
||||||
|
|
||||||
|
// sub 8
|
||||||
|
const int8x16_t x0_l = vsubq_s8(v0_0l, s8b);
|
||||||
|
const int8x16_t x0_h = vsubq_s8(v0_0h, s8b);
|
||||||
|
const int8x16_t x1_l = vsubq_s8(v0_1l, s8b);
|
||||||
|
const int8x16_t x1_h = vsubq_s8(v0_1h, s8b);
|
||||||
|
|
||||||
|
// load y
|
||||||
|
const int8x16_t y0_l = vld1q_s8(b_y0->qs);
|
||||||
|
const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16);
|
||||||
|
const int8x16_t y1_l = vld1q_s8(b_y1->qs);
|
||||||
|
const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
|
||||||
|
|
||||||
|
float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
|
||||||
|
GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
|
||||||
|
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
|
||||||
|
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
|
||||||
|
|
||||||
|
int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
|
||||||
|
int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
|
||||||
|
|
||||||
|
int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
|
||||||
|
int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
|
||||||
|
|
||||||
|
int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
|
||||||
|
int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
|
||||||
|
|
||||||
|
int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
|
||||||
|
int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
|
||||||
|
|
||||||
|
sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)),
|
||||||
|
l1, r1)), l2, r2)), l3, r3))), scale);
|
||||||
|
}
|
||||||
|
float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2);
|
||||||
|
float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
|
||||||
|
|
||||||
|
vst1_f32(s, vget_low_f32(sumv2));
|
||||||
|
vst1_f32(s + bs, vget_high_f32(sumv2));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
#if defined(__ARM_NEON)
|
#if defined(__ARM_NEON)
|
||||||
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
||||||
float32x4_t sumv1 = vdupq_n_f32(0.0f);
|
float32x4_t sumv1 = vdupq_n_f32(0.0f);
|
||||||
|
@ -3967,15 +4046,93 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx,
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||||
const int qk = QK8_1;
|
const int qk = QK8_1;
|
||||||
const int nb = n / qk;
|
const int nb = n / qk;
|
||||||
|
|
||||||
assert(n % qk == 0);
|
assert(n % qk == 0);
|
||||||
|
#if defined(__ARM_FEATURE_MATMUL_INT8)
|
||||||
|
assert((nrc == 2) || (nrc == 1));
|
||||||
|
#else
|
||||||
|
assert(nrc == 1);
|
||||||
|
#endif
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
const block_q4_1 * restrict x = vx;
|
const block_q4_1 * restrict x = vx;
|
||||||
const block_q8_1 * restrict y = vy;
|
const block_q8_1 * restrict y = vy;
|
||||||
|
|
||||||
|
#if defined(__ARM_FEATURE_MATMUL_INT8)
|
||||||
|
if (nrc == 2) {
|
||||||
|
const block_q4_1 * restrict vx0 = vx;
|
||||||
|
const block_q4_1 * restrict vx1 = vx + bx;
|
||||||
|
const block_q8_1 * restrict vy0 = vy;
|
||||||
|
const block_q8_1 * restrict vy1 = vy + by;
|
||||||
|
|
||||||
|
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
||||||
|
float32x4_t summs0 = vdupq_n_f32(0.0f);
|
||||||
|
|
||||||
|
for (int i = 0; i < nb; i++) {
|
||||||
|
const block_q4_1 * restrict b_x0 = &vx0[i];
|
||||||
|
const block_q4_1 * restrict b_x1 = &vx1[i];
|
||||||
|
const block_q8_1 * restrict b_y0 = &vy0[i];
|
||||||
|
const block_q8_1 * restrict b_y1 = &vy1[i];
|
||||||
|
|
||||||
|
float32x4_t summs_t = {GGML_FP16_TO_FP32(b_x0->m) * b_y0->s,
|
||||||
|
GGML_FP16_TO_FP32(b_x1->m) * b_y0->s,
|
||||||
|
GGML_FP16_TO_FP32(b_x0->m) * b_y1->s,
|
||||||
|
GGML_FP16_TO_FP32(b_x1->m) * b_y1->s};
|
||||||
|
summs0 += summs_t;
|
||||||
|
|
||||||
|
const uint8x16_t m4b = vdupq_n_u8(0x0F);
|
||||||
|
|
||||||
|
const uint8x16_t v0_0 = vld1q_u8(b_x0->qs);
|
||||||
|
const uint8x16_t v0_1 = vld1q_u8(b_x1->qs);
|
||||||
|
|
||||||
|
// 4-bit -> 8-bit
|
||||||
|
const int8x16_t x0_l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b));
|
||||||
|
const int8x16_t x0_h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
|
||||||
|
const int8x16_t x1_l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b));
|
||||||
|
const int8x16_t x1_h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
|
||||||
|
|
||||||
|
// load y
|
||||||
|
const int8x16_t y0_l = vld1q_s8(b_y0->qs);
|
||||||
|
const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16);
|
||||||
|
const int8x16_t y1_l = vld1q_s8(b_y1->qs);
|
||||||
|
const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
|
||||||
|
|
||||||
|
// mmla into int32x4_t
|
||||||
|
float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
|
||||||
|
GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
|
||||||
|
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
|
||||||
|
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
|
||||||
|
|
||||||
|
int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
|
||||||
|
int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
|
||||||
|
|
||||||
|
int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
|
||||||
|
int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
|
||||||
|
|
||||||
|
int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
|
||||||
|
int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
|
||||||
|
|
||||||
|
int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
|
||||||
|
int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
|
||||||
|
sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)),
|
||||||
|
l1, r1)), l2, r2)), l3, r3))), scale);
|
||||||
|
}
|
||||||
|
|
||||||
|
float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2);
|
||||||
|
float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
|
||||||
|
sumv2 = sumv2 + summs0;
|
||||||
|
|
||||||
|
vst1_f32(s, vget_low_f32(sumv2));
|
||||||
|
vst1_f32(s + bs, vget_high_f32(sumv2));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
// TODO: add WASM SIMD
|
// TODO: add WASM SIMD
|
||||||
#if defined(__ARM_NEON)
|
#if defined(__ARM_NEON)
|
||||||
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
||||||
|
@ -4107,12 +4264,17 @@ void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restri
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||||
const int qk = QK8_0;
|
const int qk = QK8_0;
|
||||||
const int nb = n / qk;
|
const int nb = n / qk;
|
||||||
|
|
||||||
assert(n % qk == 0);
|
assert(n % qk == 0);
|
||||||
assert(qk == QK5_0);
|
assert(qk == QK5_0);
|
||||||
|
assert(nrc == 1);
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
const block_q5_0 * restrict x = vx;
|
const block_q5_0 * restrict x = vx;
|
||||||
const block_q8_0 * restrict y = vy;
|
const block_q8_0 * restrict y = vy;
|
||||||
|
@ -4393,12 +4555,17 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||||
const int qk = QK8_1;
|
const int qk = QK8_1;
|
||||||
const int nb = n / qk;
|
const int nb = n / qk;
|
||||||
|
|
||||||
assert(n % qk == 0);
|
assert(n % qk == 0);
|
||||||
assert(qk == QK5_1);
|
assert(qk == QK5_1);
|
||||||
|
assert(nrc == 1);
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
const block_q5_1 * restrict x = vx;
|
const block_q5_1 * restrict x = vx;
|
||||||
const block_q8_1 * restrict y = vy;
|
const block_q8_1 * restrict y = vy;
|
||||||
|
@ -4692,15 +4859,79 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||||
const int qk = QK8_0;
|
const int qk = QK8_0;
|
||||||
const int nb = n / qk;
|
const int nb = n / qk;
|
||||||
|
|
||||||
assert(n % qk == 0);
|
assert(n % qk == 0);
|
||||||
|
#if defined(__ARM_FEATURE_MATMUL_INT8)
|
||||||
|
assert((nrc == 2) || (nrc == 1));
|
||||||
|
#else
|
||||||
|
assert(nrc == 1);
|
||||||
|
#endif
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
const block_q8_0 * restrict x = vx;
|
const block_q8_0 * restrict x = vx;
|
||||||
const block_q8_0 * restrict y = vy;
|
const block_q8_0 * restrict y = vy;
|
||||||
|
|
||||||
|
#if defined(__ARM_FEATURE_MATMUL_INT8)
|
||||||
|
if (nrc == 2) {
|
||||||
|
const block_q8_0 * restrict vx0 = vx;
|
||||||
|
const block_q8_0 * restrict vx1 = vx + bx;
|
||||||
|
const block_q8_0 * restrict vy0 = vy;
|
||||||
|
const block_q8_0 * restrict vy1 = vy + by;
|
||||||
|
|
||||||
|
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
||||||
|
|
||||||
|
for (int i = 0; i < nb; i++) {
|
||||||
|
const block_q8_0 * restrict b_x0 = &vx0[i];
|
||||||
|
const block_q8_0 * restrict b_y0 = &vy0[i];
|
||||||
|
|
||||||
|
const block_q8_0 * restrict b_x1 = &vx1[i];
|
||||||
|
const block_q8_0 * restrict b_y1 = &vy1[i];
|
||||||
|
|
||||||
|
const int8x16_t x0_l = vld1q_s8(b_x0->qs);
|
||||||
|
const int8x16_t x0_h = vld1q_s8(b_x0->qs + 16);
|
||||||
|
const int8x16_t x1_l = vld1q_s8(b_x1->qs);
|
||||||
|
const int8x16_t x1_h = vld1q_s8(b_x1->qs + 16);
|
||||||
|
|
||||||
|
// load y
|
||||||
|
const int8x16_t y0_l = vld1q_s8(b_y0->qs);
|
||||||
|
const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16);
|
||||||
|
const int8x16_t y1_l = vld1q_s8(b_y1->qs);
|
||||||
|
const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
|
||||||
|
|
||||||
|
float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
|
||||||
|
GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
|
||||||
|
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
|
||||||
|
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
|
||||||
|
|
||||||
|
int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
|
||||||
|
int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
|
||||||
|
|
||||||
|
int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
|
||||||
|
int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
|
||||||
|
|
||||||
|
int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
|
||||||
|
int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
|
||||||
|
|
||||||
|
int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
|
||||||
|
int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
|
||||||
|
|
||||||
|
sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)),
|
||||||
|
l1, r1)), l2, r2)), l3, r3))), scale);
|
||||||
|
}
|
||||||
|
float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2);
|
||||||
|
float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
|
||||||
|
|
||||||
|
vst1_f32(s, vget_low_f32(sumv2));
|
||||||
|
vst1_f32(s + bs, vget_high_f32(sumv2));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
#if defined(__ARM_NEON)
|
#if defined(__ARM_NEON)
|
||||||
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
||||||
float32x4_t sumv1 = vdupq_n_f32(0.0f);
|
float32x4_t sumv1 = vdupq_n_f32(0.0f);
|
||||||
|
@ -4795,7 +5026,12 @@ void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restri
|
||||||
}
|
}
|
||||||
|
|
||||||
#if QK_K == 256
|
#if QK_K == 256
|
||||||
void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||||
|
assert(nrc == 1);
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
const block_q2_K * restrict x = vx;
|
const block_q2_K * restrict x = vx;
|
||||||
const block_q8_K * restrict y = vy;
|
const block_q8_K * restrict y = vy;
|
||||||
|
@ -5171,7 +5407,12 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||||
|
assert(nrc == 1);
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
const block_q2_K * restrict x = vx;
|
const block_q2_K * restrict x = vx;
|
||||||
const block_q8_K * restrict y = vy;
|
const block_q8_K * restrict y = vy;
|
||||||
|
@ -5429,8 +5670,13 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if QK_K == 256
|
#if QK_K == 256
|
||||||
void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||||
assert(n % QK_K == 0);
|
assert(n % QK_K == 0);
|
||||||
|
assert(nrc == 1);
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
const uint32_t kmask1 = 0x03030303;
|
const uint32_t kmask1 = 0x03030303;
|
||||||
const uint32_t kmask2 = 0x0f0f0f0f;
|
const uint32_t kmask2 = 0x0f0f0f0f;
|
||||||
|
@ -5949,8 +6195,13 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||||
assert(n % QK_K == 0);
|
assert(n % QK_K == 0);
|
||||||
|
assert(nrc == 1);
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
const block_q3_K * restrict x = vx;
|
const block_q3_K * restrict x = vx;
|
||||||
const block_q8_K * restrict y = vy;
|
const block_q8_K * restrict y = vy;
|
||||||
|
@ -6292,8 +6543,13 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if QK_K == 256
|
#if QK_K == 256
|
||||||
void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||||
assert(n % QK_K == 0);
|
assert(n % QK_K == 0);
|
||||||
|
assert(nrc == 1);
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
const block_q4_K * restrict x = vx;
|
const block_q4_K * restrict x = vx;
|
||||||
const block_q8_K * restrict y = vy;
|
const block_q8_K * restrict y = vy;
|
||||||
|
@ -6648,8 +6904,13 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||||
assert(n % QK_K == 0);
|
assert(n % QK_K == 0);
|
||||||
|
assert(nrc == 1);
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
const block_q4_K * restrict x = vx;
|
const block_q4_K * restrict x = vx;
|
||||||
const block_q8_K * restrict y = vy;
|
const block_q8_K * restrict y = vy;
|
||||||
|
@ -6891,8 +7152,13 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if QK_K == 256
|
#if QK_K == 256
|
||||||
void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||||
assert(n % QK_K == 0);
|
assert(n % QK_K == 0);
|
||||||
|
assert(nrc == 1);
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
const block_q5_K * restrict x = vx;
|
const block_q5_K * restrict x = vx;
|
||||||
const block_q8_K * restrict y = vy;
|
const block_q8_K * restrict y = vy;
|
||||||
|
@ -7311,8 +7577,13 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||||
assert(n % QK_K == 0);
|
assert(n % QK_K == 0);
|
||||||
|
assert(nrc == 1);
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
const block_q5_K * restrict x = vx;
|
const block_q5_K * restrict x = vx;
|
||||||
const block_q8_K * restrict y = vy;
|
const block_q8_K * restrict y = vy;
|
||||||
|
@ -7577,8 +7848,13 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
|
||||||
|
|
||||||
|
|
||||||
#if QK_K == 256
|
#if QK_K == 256
|
||||||
void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||||
assert(n % QK_K == 0);
|
assert(n % QK_K == 0);
|
||||||
|
assert(nrc == 1);
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
const block_q6_K * restrict x = vx;
|
const block_q6_K * restrict x = vx;
|
||||||
const block_q8_K * restrict y = vy;
|
const block_q8_K * restrict y = vy;
|
||||||
|
@ -8009,8 +8285,13 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||||
assert(n % QK_K == 0);
|
assert(n % QK_K == 0);
|
||||||
|
assert(nrc == 1);
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
const block_q6_K * restrict x = vx;
|
const block_q6_K * restrict x = vx;
|
||||||
const block_q8_K * restrict y = vy;
|
const block_q8_K * restrict y = vy;
|
||||||
|
@ -8339,8 +8620,13 @@ static const int8_t keven_signs_q2xs[1024] = {
|
||||||
1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1,
|
1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||||
};
|
};
|
||||||
|
|
||||||
void ggml_vec_dot_iq2_xxs_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||||
assert(n % QK_K == 0);
|
assert(n % QK_K == 0);
|
||||||
|
assert(nrc == 1);
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
const block_iq2_xxs * restrict x = vx;
|
const block_iq2_xxs * restrict x = vx;
|
||||||
const block_q8_K * restrict y = vy;
|
const block_q8_K * restrict y = vy;
|
||||||
|
@ -8462,8 +8748,13 @@ void ggml_vec_dot_iq2_xxs_q8_K(const int n, float * restrict s, const void * res
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||||
assert(n % QK_K == 0);
|
assert(n % QK_K == 0);
|
||||||
|
assert(nrc == 1);
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
const block_iq2_xs * restrict x = vx;
|
const block_iq2_xs * restrict x = vx;
|
||||||
const block_q8_K * restrict y = vy;
|
const block_q8_K * restrict y = vy;
|
||||||
|
@ -8682,8 +8973,13 @@ void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * rest
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO
|
// TODO
|
||||||
void ggml_vec_dot_iq3_xxs_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||||
assert(n % QK_K == 0);
|
assert(n % QK_K == 0);
|
||||||
|
assert(nrc == 1);
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
const block_iq3_xxs * restrict x = vx;
|
const block_iq3_xxs * restrict x = vx;
|
||||||
const block_q8_K * restrict y = vy;
|
const block_q8_K * restrict y = vy;
|
||||||
|
|
|
@ -245,20 +245,20 @@ void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_
|
||||||
void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||||
|
|
||||||
// Dot product
|
// Dot product
|
||||||
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
|
|
||||||
void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
|
|
||||||
//
|
//
|
||||||
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
|
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
|
||||||
|
|
103
ggml-vulkan.cpp
103
ggml-vulkan.cpp
|
@ -27,6 +27,7 @@
|
||||||
#define CEIL_DIV(M, N) (((M) + (N)-1) / (N))
|
#define CEIL_DIV(M, N) (((M) + (N)-1) / (N))
|
||||||
|
|
||||||
#define VK_VENDOR_ID_AMD 0x1002
|
#define VK_VENDOR_ID_AMD 0x1002
|
||||||
|
#define VK_VENDOR_ID_APPLE 0x106b
|
||||||
#define VK_VENDOR_ID_INTEL 0x8086
|
#define VK_VENDOR_ID_INTEL 0x8086
|
||||||
#define VK_VENDOR_ID_NVIDIA 0x10de
|
#define VK_VENDOR_ID_NVIDIA 0x10de
|
||||||
|
|
||||||
|
@ -2034,18 +2035,100 @@ static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ct
|
||||||
return ctx->pipeline_matmul_f32_aligned_l.align;
|
return ctx->pipeline_matmul_f32_aligned_l.align;
|
||||||
}
|
}
|
||||||
|
|
||||||
static vk_pipeline* ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, int m, int n, bool aligned) {
|
static vk_pipeline* ggml_vk_guess_matmul_pipeline_amd(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, int m, int n, bool aligned) {
|
||||||
#ifdef GGML_VULKAN_DEBUG
|
|
||||||
std::cerr << "ggml_vk_guess_matmul_pipeline(" << bit16_x << ", " << bit16_y << ", " << m << ", " << n << ", " << aligned << ")";
|
|
||||||
#endif
|
|
||||||
if (bit16_x && bit16_y) {
|
if (bit16_x && bit16_y) {
|
||||||
if (ctx->device.lock()->vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) {
|
if (m <= 32 || n <= 32) {
|
||||||
#ifdef GGML_VULKAN_DEBUG
|
#ifdef GGML_VULKAN_DEBUG
|
||||||
std::cerr << " S" << std::endl;
|
std::cerr << " S" << std::endl;
|
||||||
#endif
|
#endif
|
||||||
return aligned ? &ctx->pipeline_matmul_f16_aligned_s : &ctx->pipeline_matmul_f16_s;
|
return aligned ? &ctx->pipeline_matmul_f16_aligned_s : &ctx->pipeline_matmul_f16_s;
|
||||||
}
|
}
|
||||||
if (ctx->device.lock()->subgroup_size == 64 || m <= 64 || n <= 64) {
|
#ifdef GGML_VULKAN_DEBUG
|
||||||
|
std::cerr << " M" << std::endl;
|
||||||
|
#endif
|
||||||
|
return aligned ? &ctx->pipeline_matmul_f16_aligned_m : &ctx->pipeline_matmul_f16_m;
|
||||||
|
}
|
||||||
|
if (bit16_x && !bit16_y) {
|
||||||
|
if (m <= 32 || n <= 32) {
|
||||||
|
#ifdef GGML_VULKAN_DEBUG
|
||||||
|
std::cerr << " S" << std::endl;
|
||||||
|
#endif
|
||||||
|
return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_s : &ctx->pipeline_matmul_f16_f32_s;
|
||||||
|
}
|
||||||
|
#ifdef GGML_VULKAN_DEBUG
|
||||||
|
std::cerr << " M" << std::endl;
|
||||||
|
#endif
|
||||||
|
return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_m : &ctx->pipeline_matmul_f16_f32_m;
|
||||||
|
}
|
||||||
|
if (!bit16_x && bit16_y) {
|
||||||
|
GGML_ASSERT(false);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (m <= 32 || n <= 32) {
|
||||||
|
#ifdef GGML_VULKAN_DEBUG
|
||||||
|
std::cerr << " S" << std::endl;
|
||||||
|
#endif
|
||||||
|
return aligned ? &ctx->pipeline_matmul_f32_aligned_s : &ctx->pipeline_matmul_f32_s;
|
||||||
|
}
|
||||||
|
#ifdef GGML_VULKAN_DEBUG
|
||||||
|
std::cerr << " M" << std::endl;
|
||||||
|
#endif
|
||||||
|
return aligned ? &ctx->pipeline_matmul_f32_aligned_m : &ctx->pipeline_matmul_f32_m;
|
||||||
|
}
|
||||||
|
|
||||||
|
static vk_pipeline* ggml_vk_guess_matmul_pipeline_apple(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, bool aligned) {
|
||||||
|
#ifdef GGML_VULKAN_DEBUG
|
||||||
|
std::cerr << " M" << std::endl;
|
||||||
|
#endif
|
||||||
|
if (bit16_x && bit16_y) {
|
||||||
|
return aligned ? &ctx->pipeline_matmul_f16_aligned_m : &ctx->pipeline_matmul_f16_m;
|
||||||
|
}
|
||||||
|
if (bit16_x && !bit16_y) {
|
||||||
|
return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_m : &ctx->pipeline_matmul_f16_f32_m;
|
||||||
|
}
|
||||||
|
if (!bit16_x && bit16_y) {
|
||||||
|
GGML_ASSERT(false);
|
||||||
|
}
|
||||||
|
return aligned ? &ctx->pipeline_matmul_f32_aligned_m : &ctx->pipeline_matmul_f32_m;
|
||||||
|
}
|
||||||
|
|
||||||
|
static vk_pipeline* ggml_vk_guess_matmul_pipeline_intel(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, bool aligned) {
|
||||||
|
#ifdef GGML_VULKAN_DEBUG
|
||||||
|
std::cerr << " S" << std::endl;
|
||||||
|
#endif
|
||||||
|
if (bit16_x && bit16_y) {
|
||||||
|
return aligned ? &ctx->pipeline_matmul_f16_aligned_s : &ctx->pipeline_matmul_f16_s;
|
||||||
|
}
|
||||||
|
if (bit16_x && !bit16_y) {
|
||||||
|
return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_s : &ctx->pipeline_matmul_f16_f32_s;
|
||||||
|
}
|
||||||
|
if (!bit16_x && bit16_y) {
|
||||||
|
GGML_ASSERT(false);
|
||||||
|
}
|
||||||
|
return aligned ? &ctx->pipeline_matmul_f32_aligned_s : &ctx->pipeline_matmul_f32_s;
|
||||||
|
}
|
||||||
|
|
||||||
|
static vk_pipeline* ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, int m, int n, bool aligned) {
|
||||||
|
#ifdef GGML_VULKAN_DEBUG
|
||||||
|
std::cerr << "ggml_vk_guess_matmul_pipeline(" << bit16_x << ", " << bit16_y << ", " << m << ", " << n << ", " << aligned << ")";
|
||||||
|
#endif
|
||||||
|
switch (ctx->device.lock()->vendor_id) {
|
||||||
|
case VK_VENDOR_ID_AMD:
|
||||||
|
return ggml_vk_guess_matmul_pipeline_amd(ctx, bit16_x, bit16_y, m, n, aligned);
|
||||||
|
case VK_VENDOR_ID_APPLE:
|
||||||
|
return ggml_vk_guess_matmul_pipeline_apple(ctx, bit16_x, bit16_y, aligned);
|
||||||
|
case VK_VENDOR_ID_INTEL:
|
||||||
|
return ggml_vk_guess_matmul_pipeline_intel(ctx, bit16_x, bit16_y, aligned);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (bit16_x && bit16_y) {
|
||||||
|
if (m <= 32 || n <= 32) {
|
||||||
|
#ifdef GGML_VULKAN_DEBUG
|
||||||
|
std::cerr << " S" << std::endl;
|
||||||
|
#endif
|
||||||
|
return aligned ? &ctx->pipeline_matmul_f16_aligned_s : &ctx->pipeline_matmul_f16_s;
|
||||||
|
}
|
||||||
|
if (m <= 64 || n <= 64) {
|
||||||
#ifdef GGML_VULKAN_DEBUG
|
#ifdef GGML_VULKAN_DEBUG
|
||||||
std::cerr << " M" << std::endl;
|
std::cerr << " M" << std::endl;
|
||||||
#endif
|
#endif
|
||||||
|
@ -2057,13 +2140,13 @@ static vk_pipeline* ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx,
|
||||||
return aligned ? &ctx->pipeline_matmul_f16_aligned_l : &ctx->pipeline_matmul_f16_l;
|
return aligned ? &ctx->pipeline_matmul_f16_aligned_l : &ctx->pipeline_matmul_f16_l;
|
||||||
}
|
}
|
||||||
if (bit16_x && !bit16_y) {
|
if (bit16_x && !bit16_y) {
|
||||||
if (ctx->device.lock()->vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) {
|
if (m <= 32 || n <= 32) {
|
||||||
#ifdef GGML_VULKAN_DEBUG
|
#ifdef GGML_VULKAN_DEBUG
|
||||||
std::cerr << " S" << std::endl;
|
std::cerr << " S" << std::endl;
|
||||||
#endif
|
#endif
|
||||||
return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_s : &ctx->pipeline_matmul_f16_f32_s;
|
return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_s : &ctx->pipeline_matmul_f16_f32_s;
|
||||||
}
|
}
|
||||||
if (ctx->device.lock()->subgroup_size == 64 || m <= 64 || n <= 64) {
|
if (m <= 64 || n <= 64) {
|
||||||
#ifdef GGML_VULKAN_DEBUG
|
#ifdef GGML_VULKAN_DEBUG
|
||||||
std::cerr << " M" << std::endl;
|
std::cerr << " M" << std::endl;
|
||||||
#endif
|
#endif
|
||||||
|
@ -2078,13 +2161,13 @@ static vk_pipeline* ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx,
|
||||||
GGML_ASSERT(false);
|
GGML_ASSERT(false);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ctx->device.lock()->vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) {
|
if (m <= 32 || n <= 32) {
|
||||||
#ifdef GGML_VULKAN_DEBUG
|
#ifdef GGML_VULKAN_DEBUG
|
||||||
std::cerr << " S" << std::endl;
|
std::cerr << " S" << std::endl;
|
||||||
#endif
|
#endif
|
||||||
return aligned ? &ctx->pipeline_matmul_f32_aligned_s : &ctx->pipeline_matmul_f32_s;
|
return aligned ? &ctx->pipeline_matmul_f32_aligned_s : &ctx->pipeline_matmul_f32_s;
|
||||||
}
|
}
|
||||||
if (ctx->device.lock()->subgroup_size == 64 || m <= 64 || n <= 64) {
|
if (m <= 64 || n <= 64) {
|
||||||
#ifdef GGML_VULKAN_DEBUG
|
#ifdef GGML_VULKAN_DEBUG
|
||||||
std::cerr << " M" << std::endl;
|
std::cerr << " M" << std::endl;
|
||||||
#endif
|
#endif
|
||||||
|
|
164
ggml.c
164
ggml.c
|
@ -428,8 +428,8 @@ int64_t ggml_cycles_per_ms(void) {
|
||||||
|
|
||||||
static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
|
static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
|
||||||
|
|
||||||
static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y);
|
static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc);
|
||||||
static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y);
|
static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc);
|
||||||
|
|
||||||
static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
[GGML_TYPE_I8] = {
|
[GGML_TYPE_I8] = {
|
||||||
|
@ -457,6 +457,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.is_quantized = false,
|
.is_quantized = false,
|
||||||
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
|
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
|
||||||
.vec_dot_type = GGML_TYPE_F32,
|
.vec_dot_type = GGML_TYPE_F32,
|
||||||
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_F16] = {
|
[GGML_TYPE_F16] = {
|
||||||
.type_name = "f16",
|
.type_name = "f16",
|
||||||
|
@ -468,6 +469,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row,
|
.from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row,
|
||||||
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16,
|
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16,
|
||||||
.vec_dot_type = GGML_TYPE_F16,
|
.vec_dot_type = GGML_TYPE_F16,
|
||||||
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_Q4_0] = {
|
[GGML_TYPE_Q4_0] = {
|
||||||
.type_name = "q4_0",
|
.type_name = "q4_0",
|
||||||
|
@ -479,6 +481,11 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference,
|
.from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference,
|
||||||
.vec_dot = ggml_vec_dot_q4_0_q8_0,
|
.vec_dot = ggml_vec_dot_q4_0_q8_0,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_0,
|
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||||
|
#if defined (__ARM_FEATURE_MATMUL_INT8)
|
||||||
|
.nrows = 2,
|
||||||
|
#else
|
||||||
|
.nrows = 1,
|
||||||
|
#endif
|
||||||
},
|
},
|
||||||
[GGML_TYPE_Q4_1] = {
|
[GGML_TYPE_Q4_1] = {
|
||||||
.type_name = "q4_1",
|
.type_name = "q4_1",
|
||||||
|
@ -490,6 +497,11 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference,
|
.from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference,
|
||||||
.vec_dot = ggml_vec_dot_q4_1_q8_1,
|
.vec_dot = ggml_vec_dot_q4_1_q8_1,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_1,
|
.vec_dot_type = GGML_TYPE_Q8_1,
|
||||||
|
#if defined (__ARM_FEATURE_MATMUL_INT8)
|
||||||
|
.nrows = 2,
|
||||||
|
#else
|
||||||
|
.nrows = 1,
|
||||||
|
#endif
|
||||||
},
|
},
|
||||||
[4] = { // GGML_TYPE_Q4_2
|
[4] = { // GGML_TYPE_Q4_2
|
||||||
.type_name = "DEPRECATED",
|
.type_name = "DEPRECATED",
|
||||||
|
@ -501,6 +513,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = NULL,
|
.from_float_reference = NULL,
|
||||||
.vec_dot = NULL,
|
.vec_dot = NULL,
|
||||||
.vec_dot_type = GGML_TYPE_COUNT,
|
.vec_dot_type = GGML_TYPE_COUNT,
|
||||||
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[5] = { // GGML_TYPE_Q4_3
|
[5] = { // GGML_TYPE_Q4_3
|
||||||
.type_name = "DEPRECATED",
|
.type_name = "DEPRECATED",
|
||||||
|
@ -512,6 +525,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = NULL,
|
.from_float_reference = NULL,
|
||||||
.vec_dot = NULL,
|
.vec_dot = NULL,
|
||||||
.vec_dot_type = GGML_TYPE_COUNT,
|
.vec_dot_type = GGML_TYPE_COUNT,
|
||||||
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_Q5_0] = {
|
[GGML_TYPE_Q5_0] = {
|
||||||
.type_name = "q5_0",
|
.type_name = "q5_0",
|
||||||
|
@ -523,6 +537,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference,
|
.from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference,
|
||||||
.vec_dot = ggml_vec_dot_q5_0_q8_0,
|
.vec_dot = ggml_vec_dot_q5_0_q8_0,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_0,
|
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||||
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_Q5_1] = {
|
[GGML_TYPE_Q5_1] = {
|
||||||
.type_name = "q5_1",
|
.type_name = "q5_1",
|
||||||
|
@ -534,6 +549,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference,
|
.from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference,
|
||||||
.vec_dot = ggml_vec_dot_q5_1_q8_1,
|
.vec_dot = ggml_vec_dot_q5_1_q8_1,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_1,
|
.vec_dot_type = GGML_TYPE_Q8_1,
|
||||||
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_Q8_0] = {
|
[GGML_TYPE_Q8_0] = {
|
||||||
.type_name = "q8_0",
|
.type_name = "q8_0",
|
||||||
|
@ -545,6 +561,11 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference,
|
.from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference,
|
||||||
.vec_dot = ggml_vec_dot_q8_0_q8_0,
|
.vec_dot = ggml_vec_dot_q8_0_q8_0,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_0,
|
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||||
|
#if defined (__ARM_FEATURE_MATMUL_INT8)
|
||||||
|
.nrows = 2,
|
||||||
|
#else
|
||||||
|
.nrows = 1,
|
||||||
|
#endif
|
||||||
},
|
},
|
||||||
[GGML_TYPE_Q8_1] = {
|
[GGML_TYPE_Q8_1] = {
|
||||||
.type_name = "q8_1",
|
.type_name = "q8_1",
|
||||||
|
@ -554,6 +575,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float = quantize_row_q8_1,
|
.from_float = quantize_row_q8_1,
|
||||||
.from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference,
|
.from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_1,
|
.vec_dot_type = GGML_TYPE_Q8_1,
|
||||||
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_Q2_K] = {
|
[GGML_TYPE_Q2_K] = {
|
||||||
.type_name = "q2_K",
|
.type_name = "q2_K",
|
||||||
|
@ -565,6 +587,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = (ggml_from_float_t) quantize_row_q2_K_reference,
|
.from_float_reference = (ggml_from_float_t) quantize_row_q2_K_reference,
|
||||||
.vec_dot = ggml_vec_dot_q2_K_q8_K,
|
.vec_dot = ggml_vec_dot_q2_K_q8_K,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||||
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_Q3_K] = {
|
[GGML_TYPE_Q3_K] = {
|
||||||
.type_name = "q3_K",
|
.type_name = "q3_K",
|
||||||
|
@ -576,6 +599,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = (ggml_from_float_t) quantize_row_q3_K_reference,
|
.from_float_reference = (ggml_from_float_t) quantize_row_q3_K_reference,
|
||||||
.vec_dot = ggml_vec_dot_q3_K_q8_K,
|
.vec_dot = ggml_vec_dot_q3_K_q8_K,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||||
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_Q4_K] = {
|
[GGML_TYPE_Q4_K] = {
|
||||||
.type_name = "q4_K",
|
.type_name = "q4_K",
|
||||||
|
@ -587,6 +611,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = (ggml_from_float_t) quantize_row_q4_K_reference,
|
.from_float_reference = (ggml_from_float_t) quantize_row_q4_K_reference,
|
||||||
.vec_dot = ggml_vec_dot_q4_K_q8_K,
|
.vec_dot = ggml_vec_dot_q4_K_q8_K,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||||
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_Q5_K] = {
|
[GGML_TYPE_Q5_K] = {
|
||||||
.type_name = "q5_K",
|
.type_name = "q5_K",
|
||||||
|
@ -598,6 +623,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = (ggml_from_float_t) quantize_row_q5_K_reference,
|
.from_float_reference = (ggml_from_float_t) quantize_row_q5_K_reference,
|
||||||
.vec_dot = ggml_vec_dot_q5_K_q8_K,
|
.vec_dot = ggml_vec_dot_q5_K_q8_K,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||||
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_Q6_K] = {
|
[GGML_TYPE_Q6_K] = {
|
||||||
.type_name = "q6_K",
|
.type_name = "q6_K",
|
||||||
|
@ -609,6 +635,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference,
|
.from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference,
|
||||||
.vec_dot = ggml_vec_dot_q6_K_q8_K,
|
.vec_dot = ggml_vec_dot_q6_K_q8_K,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||||
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_IQ2_XXS] = {
|
[GGML_TYPE_IQ2_XXS] = {
|
||||||
.type_name = "iq2_xxs",
|
.type_name = "iq2_xxs",
|
||||||
|
@ -620,6 +647,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = NULL,
|
.from_float_reference = NULL,
|
||||||
.vec_dot = ggml_vec_dot_iq2_xxs_q8_K,
|
.vec_dot = ggml_vec_dot_iq2_xxs_q8_K,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||||
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_IQ2_XS] = {
|
[GGML_TYPE_IQ2_XS] = {
|
||||||
.type_name = "iq2_xs",
|
.type_name = "iq2_xs",
|
||||||
|
@ -631,6 +659,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = NULL,
|
.from_float_reference = NULL,
|
||||||
.vec_dot = ggml_vec_dot_iq2_xs_q8_K,
|
.vec_dot = ggml_vec_dot_iq2_xs_q8_K,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||||
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_IQ3_XXS] = {
|
[GGML_TYPE_IQ3_XXS] = {
|
||||||
.type_name = "iq3_xxs",
|
.type_name = "iq3_xxs",
|
||||||
|
@ -642,6 +671,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = (ggml_from_float_t)quantize_row_iq3_xxs_reference,
|
.from_float_reference = (ggml_from_float_t)quantize_row_iq3_xxs_reference,
|
||||||
.vec_dot = ggml_vec_dot_iq3_xxs_q8_K,
|
.vec_dot = ggml_vec_dot_iq3_xxs_q8_K,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||||
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_Q8_K] = {
|
[GGML_TYPE_Q8_K] = {
|
||||||
.type_name = "q8_K",
|
.type_name = "q8_K",
|
||||||
|
@ -1212,7 +1242,13 @@ inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x)
|
||||||
inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
|
inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
|
||||||
inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; }
|
inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; }
|
||||||
|
|
||||||
static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) {
|
static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc) {
|
||||||
|
assert(nrc == 1);
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
#ifdef GGML_SIMD
|
#ifdef GGML_SIMD
|
||||||
float sumf = 0.0f;
|
float sumf = 0.0f;
|
||||||
const int np = (n & ~(GGML_F32_STEP - 1));
|
const int np = (n & ~(GGML_F32_STEP - 1));
|
||||||
|
@ -1249,7 +1285,13 @@ static void ggml_vec_dot_f32(const int n, float * restrict s, const float * rest
|
||||||
*s = sumf;
|
*s = sumf;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) {
|
static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc) {
|
||||||
|
assert(nrc == 1);
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
ggml_float sumf = 0.0;
|
ggml_float sumf = 0.0;
|
||||||
|
|
||||||
#if defined(GGML_SIMD)
|
#if defined(GGML_SIMD)
|
||||||
|
@ -1455,7 +1497,7 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, x, x); *s = sqrtf(*s); }
|
inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1); *s = sqrtf(*s); }
|
||||||
inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; }
|
inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; }
|
||||||
inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
|
inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
|
||||||
inline static void ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); }
|
inline static void ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); }
|
||||||
|
@ -9992,6 +10034,7 @@ static void ggml_compute_forward_mul_mat(
|
||||||
ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
|
ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
|
||||||
enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
|
enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
|
||||||
ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
|
ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
|
||||||
|
int64_t const vec_dot_num_rows = type_traits[type].nrows;
|
||||||
|
|
||||||
GGML_ASSERT(ne0 == ne01);
|
GGML_ASSERT(ne0 == ne01);
|
||||||
GGML_ASSERT(ne1 == ne11);
|
GGML_ASSERT(ne1 == ne11);
|
||||||
|
@ -10159,12 +10202,23 @@ static void ggml_compute_forward_mul_mat(
|
||||||
const int64_t blck_0 = 16;
|
const int64_t blck_0 = 16;
|
||||||
const int64_t blck_1 = 16;
|
const int64_t blck_1 = 16;
|
||||||
|
|
||||||
|
// dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols
|
||||||
|
int64_t nrc = vec_dot_num_rows;
|
||||||
|
// TODO: currently the mmla kernels support only even numbered rows/cols.
|
||||||
|
// this check can be removed once they are extended to support odd numbered rows/cols too
|
||||||
|
if ((nr0 % 2 != 0) || (ne11 % 2 != 0)) {
|
||||||
|
nrc = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
const size_t src1_col_stride = src1_cont || src1->type != vec_dot_type ? row_size : nb11;
|
||||||
|
|
||||||
// attempt to reduce false-sharing (does not seem to make a difference)
|
// attempt to reduce false-sharing (does not seem to make a difference)
|
||||||
float tmp[16];
|
// 16 * 2, accounting for mmla kernels
|
||||||
|
float tmp[32];
|
||||||
|
|
||||||
for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
|
for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
|
||||||
for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
|
for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
|
||||||
for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
|
for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ir1 += nrc) {
|
||||||
const int64_t i13 = (ir1/(ne12*ne1));
|
const int64_t i13 = (ir1/(ne12*ne1));
|
||||||
const int64_t i12 = (ir1 - i13*ne12*ne1)/ne1;
|
const int64_t i12 = (ir1 - i13*ne12*ne1)/ne1;
|
||||||
const int64_t i11 = (ir1 - i13*ne12*ne1 - i12*ne1);
|
const int64_t i11 = (ir1 - i13*ne12*ne1 - i12*ne1);
|
||||||
|
@ -10187,17 +10241,19 @@ static void ggml_compute_forward_mul_mat(
|
||||||
(src1_cont || src1->type != vec_dot_type
|
(src1_cont || src1->type != vec_dot_type
|
||||||
? (i11 + i12*ne11 + i13*ne12*ne11)*row_size
|
? (i11 + i12*ne11 + i13*ne12*ne11)*row_size
|
||||||
: (i11*nb11 + i12*nb12 + i13*nb13));
|
: (i11*nb11 + i12*nb12 + i13*nb13));
|
||||||
|
|
||||||
float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
|
float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
|
||||||
|
|
||||||
//for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
|
//for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
|
||||||
// vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
|
// vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
|
||||||
//}
|
//}
|
||||||
|
|
||||||
for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
|
for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ir0 += nrc) {
|
||||||
vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col);
|
vec_dot(ne00, &tmp[ir0 - iir0], (nrc>1 ? 16 : 0), src0_row + ir0*nb01, (nrc>1 ? nb01 : 0), src1_col, (nrc>1 ? src1_col_stride : 0), nrc);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int cn = 0; cn < nrc; ++cn) {
|
||||||
|
memcpy(&dst_col[iir0 + cn*nb1/nb0], tmp + (cn*16), (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
|
||||||
}
|
}
|
||||||
memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -10386,7 +10442,7 @@ static void ggml_compute_forward_mul_mat_id(
|
||||||
//}
|
//}
|
||||||
|
|
||||||
for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
|
for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
|
||||||
vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col);
|
vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_row + ir0*nb01, 0, src1_col, 0, 1);
|
||||||
}
|
}
|
||||||
memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
|
memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
|
||||||
}
|
}
|
||||||
|
@ -11568,7 +11624,7 @@ static void ggml_compute_forward_soft_max_back_f32(
|
||||||
|
|
||||||
// linear runtime, no additional memory
|
// linear runtime, no additional memory
|
||||||
float dot_y_dy = 0;
|
float dot_y_dy = 0;
|
||||||
ggml_vec_dot_f32 (nc, &dot_y_dy, y, dy);
|
ggml_vec_dot_f32 (nc, &dot_y_dy, 0, y, 0, dy, 0, 1);
|
||||||
ggml_vec_cpy_f32 (nc, dx, dy);
|
ggml_vec_cpy_f32 (nc, dx, dy);
|
||||||
ggml_vec_acc1_f32(nc, dx, -dot_y_dy);
|
ggml_vec_acc1_f32(nc, dx, -dot_y_dy);
|
||||||
ggml_vec_mul_f32 (nc, dx, dx, y);
|
ggml_vec_mul_f32 (nc, dx, dx, y);
|
||||||
|
@ -12369,9 +12425,9 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32(
|
||||||
const int i1n = i10*ne11;
|
const int i1n = i10*ne11;
|
||||||
for (int i00 = 0; i00 < ne00; i00++) {
|
for (int i00 = 0; i00 < ne00; i00++) {
|
||||||
float v = 0;
|
float v = 0;
|
||||||
ggml_vec_dot_f16(ne02, &v,
|
ggml_vec_dot_f16(ne02, &v, 0,
|
||||||
(ggml_fp16_t *) wdata_src + i1n,
|
(ggml_fp16_t *) wdata_src + i1n, 0,
|
||||||
(ggml_fp16_t *) wdata_kernel + i00*ne02);
|
(ggml_fp16_t *) wdata_kernel + i00*ne02, 0, 1);
|
||||||
dst_data[i10*s0 + i00] += v;
|
dst_data[i10*s0 + i00] += v;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -12466,9 +12522,9 @@ static void ggml_compute_forward_conv_transpose_1d_f32(
|
||||||
const int i1n = i10*ne11;
|
const int i1n = i10*ne11;
|
||||||
for (int i00 = 0; i00 < ne00; i00++) {
|
for (int i00 = 0; i00 < ne00; i00++) {
|
||||||
float v = 0;
|
float v = 0;
|
||||||
ggml_vec_dot_f32(ne02, &v,
|
ggml_vec_dot_f32(ne02, &v, 0,
|
||||||
wdata_src + i1n,
|
wdata_src + i1n, 0,
|
||||||
wdata_kernel + i00*ne02);
|
wdata_kernel + i00*ne02, 0, 1);
|
||||||
dst_data[i10*s0 + i00] += v;
|
dst_data[i10*s0 + i00] += v;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -12783,9 +12839,9 @@ static void ggml_compute_forward_conv_transpose_2d(
|
||||||
for (int i01 = 0; i01 < ne01; i01++) {
|
for (int i01 = 0; i01 < ne01; i01++) {
|
||||||
for (int i00 = 0; i00 < ne00; i00++) {
|
for (int i00 = 0; i00 < ne00; i00++) {
|
||||||
float v = 0;
|
float v = 0;
|
||||||
ggml_vec_dot_f16(ne03, &v,
|
ggml_vec_dot_f16(ne03, &v, 0,
|
||||||
wdata_src + i1n,
|
wdata_src + i1n, 0,
|
||||||
wdata_kernel + i01*ne00*ne03 + i00*ne03);
|
wdata_kernel + i01*ne00*ne03 + i00*ne03, 0, 1);
|
||||||
dst_data[(i11*stride + i01)*ne0 + i10*stride + i00] += v;
|
dst_data[(i11*stride + i01)*ne0 + i10*stride + i00] += v;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -13214,9 +13270,9 @@ static void ggml_compute_forward_flash_attn_f32(
|
||||||
const int i1 = ik1;
|
const int i1 = ik1;
|
||||||
|
|
||||||
ggml_vec_dot_f32(neq0,
|
ggml_vec_dot_f32(neq0,
|
||||||
S + i1,
|
S + i1, 0,
|
||||||
(float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)),
|
(float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
|
||||||
(float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));
|
(float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
// scale
|
// scale
|
||||||
|
@ -13299,9 +13355,9 @@ static void ggml_compute_forward_flash_attn_f32(
|
||||||
const int iv3 = iq3;
|
const int iv3 = iq3;
|
||||||
|
|
||||||
ggml_vec_dot_f32(masked_begin,
|
ggml_vec_dot_f32(masked_begin,
|
||||||
(float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)),
|
(float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), 0,
|
||||||
(float *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)),
|
(float *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), 0,
|
||||||
S);
|
S, 0, 1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -13404,9 +13460,9 @@ static void ggml_compute_forward_flash_attn_f16(
|
||||||
const int i1 = ik1;
|
const int i1 = ik1;
|
||||||
|
|
||||||
ggml_vec_dot_f16(neq0,
|
ggml_vec_dot_f16(neq0,
|
||||||
S + i1,
|
S + i1, 0,
|
||||||
(ggml_fp16_t *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)),
|
(ggml_fp16_t *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
|
||||||
(ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));
|
(ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
for (int64_t ic = 0; ic < nek1; ic += GGML_VEC_DOT_UNROLL) {
|
for (int64_t ic = 0; ic < nek1; ic += GGML_VEC_DOT_UNROLL) {
|
||||||
|
@ -13508,9 +13564,9 @@ static void ggml_compute_forward_flash_attn_f16(
|
||||||
const int iv3 = iq3;
|
const int iv3 = iq3;
|
||||||
|
|
||||||
ggml_vec_dot_f16(nev0,
|
ggml_vec_dot_f16(nev0,
|
||||||
(float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)),
|
(float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), 0,
|
||||||
(ggml_fp16_t *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)),
|
(ggml_fp16_t *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), 0,
|
||||||
S16);
|
S16, 0, 1);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
for (int64_t ic = 0; ic < nev1; ic += GGML_VEC_DOT_UNROLL) {
|
for (int64_t ic = 0; ic < nev1; ic += GGML_VEC_DOT_UNROLL) {
|
||||||
|
@ -13652,9 +13708,9 @@ static void ggml_compute_forward_flash_ff_f16(
|
||||||
const int i1 = ib01;
|
const int i1 = ib01;
|
||||||
|
|
||||||
ggml_vec_dot_f16(nea0,
|
ggml_vec_dot_f16(nea0,
|
||||||
S + i1,
|
S + i1, 0,
|
||||||
(ggml_fp16_t *) ((char *) b0->data + (ib01*nbb01 + ib02*nbb02 + ib03*nbb03)),
|
(ggml_fp16_t *) ((char *) b0->data + (ib01*nbb01 + ib02*nbb02 + ib03*nbb03)), 0,
|
||||||
(ggml_fp16_t *) ((char *) a->data + ( ia1*nba1 + ia2*nba2 + ia3*nba3)));
|
(ggml_fp16_t *) ((char *) a->data + ( ia1*nba1 + ia2*nba2 + ia3*nba3)), 0, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_vec_add_f32(neb01, S, S, (float *) b1->data);
|
ggml_vec_add_f32(neb01, S, S, (float *) b1->data);
|
||||||
|
@ -13677,9 +13733,9 @@ static void ggml_compute_forward_flash_ff_f16(
|
||||||
for (int64_t ic = 0; ic < nec01; ++ic) {
|
for (int64_t ic = 0; ic < nec01; ++ic) {
|
||||||
|
|
||||||
ggml_vec_dot_f16(neb01,
|
ggml_vec_dot_f16(neb01,
|
||||||
(float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)),
|
(float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), 0,
|
||||||
(ggml_fp16_t *) ((char *) c0->data + ( ic*nbc01 + i2*nbc02 + i3*nbc03)),
|
(ggml_fp16_t *) ((char *) c0->data + ( ic*nbc01 + i2*nbc02 + i3*nbc03)), 0,
|
||||||
S16);
|
S16, 0, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_vec_add_f32(nec01,
|
ggml_vec_add_f32(nec01,
|
||||||
|
@ -13866,9 +13922,9 @@ static void ggml_compute_forward_flash_attn_back_f32(
|
||||||
const int i1 = ik1;
|
const int i1 = ik1;
|
||||||
|
|
||||||
ggml_vec_dot_f32(neq0,
|
ggml_vec_dot_f32(neq0,
|
||||||
S + i1,
|
S + i1, 0,
|
||||||
(float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)),
|
(float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
|
||||||
(float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));
|
(float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
// scale
|
// scale
|
||||||
|
@ -14013,7 +14069,7 @@ static void ggml_compute_forward_flash_attn_back_f32(
|
||||||
|
|
||||||
// S = SM * (S - dot(SM, S))
|
// S = SM * (S - dot(SM, S))
|
||||||
float dot_SM_gradSM = 0;
|
float dot_SM_gradSM = 0;
|
||||||
ggml_vec_dot_f32 (masked_begin, &dot_SM_gradSM, SM, S);
|
ggml_vec_dot_f32 (masked_begin, &dot_SM_gradSM, 0, SM, 0, S, 0, 1);
|
||||||
ggml_vec_acc1_f32(M, S, -dot_SM_gradSM);
|
ggml_vec_acc1_f32(M, S, -dot_SM_gradSM);
|
||||||
ggml_vec_mul_f32 (masked_begin, S, S, SM);
|
ggml_vec_mul_f32 (masked_begin, S, S, SM);
|
||||||
|
|
||||||
|
@ -18382,7 +18438,7 @@ static enum ggml_opt_result linesearch_backtracking(
|
||||||
}
|
}
|
||||||
|
|
||||||
// compute the initial gradient in the search direction
|
// compute the initial gradient in the search direction
|
||||||
ggml_vec_dot_f32(nx, &dginit, g, d);
|
ggml_vec_dot_f32(nx, &dginit, 0, g, 0, d, 0, 1);
|
||||||
|
|
||||||
// make sure that d points to a descent direction
|
// make sure that d points to a descent direction
|
||||||
if (0 < dginit) {
|
if (0 < dginit) {
|
||||||
|
@ -18432,7 +18488,7 @@ static enum ggml_opt_result linesearch_backtracking(
|
||||||
return count;
|
return count;
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_vec_dot_f32(nx, &dg, g, d);
|
ggml_vec_dot_f32(nx, &dg, 0, g, 0, d, 0, 1);
|
||||||
|
|
||||||
// check the Wolfe condition
|
// check the Wolfe condition
|
||||||
if (dg < params->lbfgs.wolfe * dginit) {
|
if (dg < params->lbfgs.wolfe * dginit) {
|
||||||
|
@ -18693,8 +18749,8 @@ static enum ggml_opt_result ggml_opt_lbfgs(
|
||||||
// ys = y^t \cdot s -> 1 / \rho.
|
// ys = y^t \cdot s -> 1 / \rho.
|
||||||
// yy = y^t \cdot y.
|
// yy = y^t \cdot y.
|
||||||
//
|
//
|
||||||
ggml_vec_dot_f32(nx, &ys, &lm_y[end[0]*nx], &lm_s[end[0]*nx]);
|
ggml_vec_dot_f32(nx, &ys, 0, &lm_y[end[0]*nx], 0, &lm_s[end[0]*nx], 0, 1);
|
||||||
ggml_vec_dot_f32(nx, &yy, &lm_y[end[0]*nx], &lm_y[end[0]*nx]);
|
ggml_vec_dot_f32(nx, &yy, 0, &lm_y[end[0]*nx], 0, &lm_y[end[0]*nx], 0, 1);
|
||||||
|
|
||||||
lm_ys[end[0]] = ys;
|
lm_ys[end[0]] = ys;
|
||||||
|
|
||||||
|
@ -18713,7 +18769,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
|
||||||
for (int i = 0; i < bound; ++i) {
|
for (int i = 0; i < bound; ++i) {
|
||||||
j[0] = (j[0] + m - 1) % m;
|
j[0] = (j[0] + m - 1) % m;
|
||||||
// \alpha_{j} = \rho_{j} s^{t}_{j} \cdot q_{k+1}
|
// \alpha_{j} = \rho_{j} s^{t}_{j} \cdot q_{k+1}
|
||||||
ggml_vec_dot_f32(nx, &lm_alpha[j[0]], &lm_s[j[0]*nx], d);
|
ggml_vec_dot_f32(nx, &lm_alpha[j[0]], 0, &lm_s[j[0]*nx], 0, d, 0, 1);
|
||||||
lm_alpha[j[0]] /= lm_ys[j[0]];
|
lm_alpha[j[0]] /= lm_ys[j[0]];
|
||||||
// q_{i} = q_{i+1} - \alpha_{i} y_{i}
|
// q_{i} = q_{i+1} - \alpha_{i} y_{i}
|
||||||
ggml_vec_mad_f32(nx, d, &lm_y[j[0]*nx], -lm_alpha[j[0]]);
|
ggml_vec_mad_f32(nx, d, &lm_y[j[0]*nx], -lm_alpha[j[0]]);
|
||||||
|
@ -18723,7 +18779,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
|
||||||
|
|
||||||
for (int i = 0; i < bound; ++i) {
|
for (int i = 0; i < bound; ++i) {
|
||||||
// \beta_{j} = \rho_{j} y^t_{j} \cdot \gamma_{i}
|
// \beta_{j} = \rho_{j} y^t_{j} \cdot \gamma_{i}
|
||||||
ggml_vec_dot_f32(nx, &beta, &lm_y[j[0]*nx], d);
|
ggml_vec_dot_f32(nx, &beta, 0, &lm_y[j[0]*nx], 0, d, 0, 1);
|
||||||
beta /= lm_ys[j[0]];
|
beta /= lm_ys[j[0]];
|
||||||
// \gamma_{i+1} = \gamma_{i} + (\alpha_{j} - \beta_{j}) s_{j}
|
// \gamma_{i+1} = \gamma_{i} + (\alpha_{j} - \beta_{j}) s_{j}
|
||||||
ggml_vec_mad_f32(nx, d, &lm_s[j[0]*nx], lm_alpha[j[0]] - beta);
|
ggml_vec_mad_f32(nx, d, &lm_s[j[0]*nx], lm_alpha[j[0]] - beta);
|
||||||
|
@ -20621,4 +20677,12 @@ int ggml_cpu_has_vsx(void) {
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int ggml_cpu_has_matmul_int8(void) {
|
||||||
|
#if defined(__ARM_FEATURE_MATMUL_INT8)
|
||||||
|
return 1;
|
||||||
|
#else
|
||||||
|
return 0;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
5
ggml.h
5
ggml.h
|
@ -2290,6 +2290,7 @@ extern "C" {
|
||||||
GGML_API int ggml_cpu_has_ssse3 (void);
|
GGML_API int ggml_cpu_has_ssse3 (void);
|
||||||
GGML_API int ggml_cpu_has_sycl (void);
|
GGML_API int ggml_cpu_has_sycl (void);
|
||||||
GGML_API int ggml_cpu_has_vsx (void);
|
GGML_API int ggml_cpu_has_vsx (void);
|
||||||
|
GGML_API int ggml_cpu_has_matmul_int8(void);
|
||||||
|
|
||||||
//
|
//
|
||||||
// Internal types and functions exposed for tests and benchmarks
|
// Internal types and functions exposed for tests and benchmarks
|
||||||
|
@ -2303,7 +2304,8 @@ extern "C" {
|
||||||
#endif
|
#endif
|
||||||
typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||||
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||||
typedef void (*ggml_vec_dot_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
|
typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
|
||||||
|
const void * GGML_RESTRICT y, size_t by, int nrc);
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
const char * type_name;
|
const char * type_name;
|
||||||
|
@ -2315,6 +2317,7 @@ extern "C" {
|
||||||
ggml_from_float_t from_float_reference;
|
ggml_from_float_t from_float_reference;
|
||||||
ggml_vec_dot_t vec_dot;
|
ggml_vec_dot_t vec_dot;
|
||||||
enum ggml_type vec_dot_type;
|
enum ggml_type vec_dot_type;
|
||||||
|
int64_t nrows; // number of rows to process simultaneously;
|
||||||
} ggml_type_traits_t;
|
} ggml_type_traits_t;
|
||||||
|
|
||||||
GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
|
GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
|
||||||
|
|
|
@ -50,6 +50,7 @@ class Keys:
|
||||||
VALUE_LENGTH = "{arch}.attention.value_length"
|
VALUE_LENGTH = "{arch}.attention.value_length"
|
||||||
LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon"
|
LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon"
|
||||||
LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon"
|
LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon"
|
||||||
|
CAUSAL = "{arch}.attention.causal"
|
||||||
|
|
||||||
class Rope:
|
class Rope:
|
||||||
DIMENSION_COUNT = "{arch}.rope.dimension_count"
|
DIMENSION_COUNT = "{arch}.rope.dimension_count"
|
||||||
|
@ -63,6 +64,7 @@ class Keys:
|
||||||
MODEL = "tokenizer.ggml.model"
|
MODEL = "tokenizer.ggml.model"
|
||||||
LIST = "tokenizer.ggml.tokens"
|
LIST = "tokenizer.ggml.tokens"
|
||||||
TOKEN_TYPE = "tokenizer.ggml.token_type"
|
TOKEN_TYPE = "tokenizer.ggml.token_type"
|
||||||
|
TOKEN_TYPE_COUNT = "tokenizer.ggml.token_type_count" # for BERT-style token types
|
||||||
SCORES = "tokenizer.ggml.scores"
|
SCORES = "tokenizer.ggml.scores"
|
||||||
MERGES = "tokenizer.ggml.merges"
|
MERGES = "tokenizer.ggml.merges"
|
||||||
BOS_ID = "tokenizer.ggml.bos_token_id"
|
BOS_ID = "tokenizer.ggml.bos_token_id"
|
||||||
|
@ -122,6 +124,7 @@ class MODEL_TENSOR(IntEnum):
|
||||||
ATTN_OUT = auto()
|
ATTN_OUT = auto()
|
||||||
ATTN_NORM = auto()
|
ATTN_NORM = auto()
|
||||||
ATTN_NORM_2 = auto()
|
ATTN_NORM_2 = auto()
|
||||||
|
ATTN_OUT_NORM = auto()
|
||||||
ATTN_ROT_EMBD = auto()
|
ATTN_ROT_EMBD = auto()
|
||||||
FFN_GATE_INP = auto()
|
FFN_GATE_INP = auto()
|
||||||
FFN_NORM = auto()
|
FFN_NORM = auto()
|
||||||
|
@ -134,6 +137,7 @@ class MODEL_TENSOR(IntEnum):
|
||||||
FFN_UP_EXP = auto()
|
FFN_UP_EXP = auto()
|
||||||
ATTN_Q_NORM = auto()
|
ATTN_Q_NORM = auto()
|
||||||
ATTN_K_NORM = auto()
|
ATTN_K_NORM = auto()
|
||||||
|
LAYER_OUT_NORM = auto()
|
||||||
|
|
||||||
|
|
||||||
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||||
|
@ -178,6 +182,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||||
MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
|
MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
|
||||||
MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm",
|
MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm",
|
||||||
MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm",
|
MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm",
|
||||||
|
MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm",
|
||||||
MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp",
|
MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp",
|
||||||
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
|
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
|
||||||
MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
|
MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
|
||||||
|
@ -187,6 +192,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||||
MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate.{xid}",
|
MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate.{xid}",
|
||||||
MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down.{xid}",
|
MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down.{xid}",
|
||||||
MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up.{xid}",
|
MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up.{xid}",
|
||||||
|
MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm",
|
||||||
}
|
}
|
||||||
|
|
||||||
MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||||
|
@ -262,17 +268,18 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||||
],
|
],
|
||||||
MODEL_ARCH.BERT: [
|
MODEL_ARCH.BERT: [
|
||||||
MODEL_TENSOR.TOKEN_EMBD,
|
MODEL_TENSOR.TOKEN_EMBD,
|
||||||
|
MODEL_TENSOR.TOKEN_EMBD_NORM,
|
||||||
MODEL_TENSOR.TOKEN_TYPES,
|
MODEL_TENSOR.TOKEN_TYPES,
|
||||||
MODEL_TENSOR.POS_EMBD,
|
MODEL_TENSOR.POS_EMBD,
|
||||||
MODEL_TENSOR.OUTPUT_NORM,
|
MODEL_TENSOR.OUTPUT_NORM,
|
||||||
MODEL_TENSOR.ATTN_NORM,
|
MODEL_TENSOR.ATTN_OUT_NORM,
|
||||||
MODEL_TENSOR.ATTN_Q,
|
MODEL_TENSOR.ATTN_Q,
|
||||||
MODEL_TENSOR.ATTN_K,
|
MODEL_TENSOR.ATTN_K,
|
||||||
MODEL_TENSOR.ATTN_V,
|
MODEL_TENSOR.ATTN_V,
|
||||||
MODEL_TENSOR.ATTN_OUT,
|
MODEL_TENSOR.ATTN_OUT,
|
||||||
MODEL_TENSOR.FFN_NORM,
|
|
||||||
MODEL_TENSOR.FFN_DOWN,
|
MODEL_TENSOR.FFN_DOWN,
|
||||||
MODEL_TENSOR.FFN_UP,
|
MODEL_TENSOR.FFN_UP,
|
||||||
|
MODEL_TENSOR.LAYER_OUT_NORM,
|
||||||
],
|
],
|
||||||
MODEL_ARCH.MPT: [
|
MODEL_ARCH.MPT: [
|
||||||
MODEL_TENSOR.TOKEN_EMBD,
|
MODEL_TENSOR.TOKEN_EMBD,
|
||||||
|
|
|
@ -357,6 +357,9 @@ class GGUFWriter:
|
||||||
def add_layer_norm_rms_eps(self, value: float) -> None:
|
def add_layer_norm_rms_eps(self, value: float) -> None:
|
||||||
self.add_float32(Keys.Attention.LAYERNORM_RMS_EPS.format(arch=self.arch), value)
|
self.add_float32(Keys.Attention.LAYERNORM_RMS_EPS.format(arch=self.arch), value)
|
||||||
|
|
||||||
|
def add_causal_attention(self, value: bool) -> None:
|
||||||
|
self.add_bool(Keys.Attention.CAUSAL.format(arch=self.arch), value)
|
||||||
|
|
||||||
def add_rope_dimension_count(self, count: int) -> None:
|
def add_rope_dimension_count(self, count: int) -> None:
|
||||||
self.add_uint32(Keys.Rope.DIMENSION_COUNT.format(arch=self.arch), count)
|
self.add_uint32(Keys.Rope.DIMENSION_COUNT.format(arch=self.arch), count)
|
||||||
|
|
||||||
|
@ -387,6 +390,9 @@ class GGUFWriter:
|
||||||
def add_token_types(self, types: Sequence[TokenType] | Sequence[int]) -> None:
|
def add_token_types(self, types: Sequence[TokenType] | Sequence[int]) -> None:
|
||||||
self.add_array(Keys.Tokenizer.TOKEN_TYPE, types)
|
self.add_array(Keys.Tokenizer.TOKEN_TYPE, types)
|
||||||
|
|
||||||
|
def add_token_type_count(self, value: int) -> None:
|
||||||
|
self.add_uint32(Keys.Tokenizer.TOKEN_TYPE_COUNT, value)
|
||||||
|
|
||||||
def add_token_scores(self, scores: Sequence[float]) -> None:
|
def add_token_scores(self, scores: Sequence[float]) -> None:
|
||||||
self.add_array(Keys.Tokenizer.SCORES, scores)
|
self.add_array(Keys.Tokenizer.SCORES, scores)
|
||||||
|
|
||||||
|
|
|
@ -30,6 +30,7 @@ class TensorNameMap:
|
||||||
# Normalization of token embeddings
|
# Normalization of token embeddings
|
||||||
MODEL_TENSOR.TOKEN_EMBD_NORM: (
|
MODEL_TENSOR.TOKEN_EMBD_NORM: (
|
||||||
"word_embeddings_layernorm", # bloom
|
"word_embeddings_layernorm", # bloom
|
||||||
|
"embeddings.LayerNorm", # bert
|
||||||
),
|
),
|
||||||
|
|
||||||
# Position embeddings
|
# Position embeddings
|
||||||
|
@ -54,7 +55,6 @@ class TensorNameMap:
|
||||||
"transformer.ln_f", # gpt2 gpt-j falcon
|
"transformer.ln_f", # gpt2 gpt-j falcon
|
||||||
"model.norm", # llama-hf baichuan internlm2
|
"model.norm", # llama-hf baichuan internlm2
|
||||||
"norm", # llama-pth
|
"norm", # llama-pth
|
||||||
"embeddings.LayerNorm", # bert
|
|
||||||
"transformer.norm_f", # mpt
|
"transformer.norm_f", # mpt
|
||||||
"ln_f", # refact bloom qwen gpt2
|
"ln_f", # refact bloom qwen gpt2
|
||||||
"language_model.encoder.final_layernorm", # persimmon
|
"language_model.encoder.final_layernorm", # persimmon
|
||||||
|
@ -79,7 +79,6 @@ class TensorNameMap:
|
||||||
"transformer.h.{bid}.ln_mlp", # falcon40b
|
"transformer.h.{bid}.ln_mlp", # falcon40b
|
||||||
"model.layers.{bid}.input_layernorm", # llama-hf
|
"model.layers.{bid}.input_layernorm", # llama-hf
|
||||||
"layers.{bid}.attention_norm", # llama-pth
|
"layers.{bid}.attention_norm", # llama-pth
|
||||||
"encoder.layer.{bid}.attention.output.LayerNorm", # bert
|
|
||||||
"language_model.encoder.layers.{bid}.input_layernorm", # persimmon
|
"language_model.encoder.layers.{bid}.input_layernorm", # persimmon
|
||||||
"model.layers.{bid}.ln1", # yi
|
"model.layers.{bid}.ln1", # yi
|
||||||
"h.{bid}.ln_1", # gpt2
|
"h.{bid}.ln_1", # gpt2
|
||||||
|
@ -155,6 +154,11 @@ class TensorNameMap:
|
||||||
"model.layers.{bid}.attention.wo", # internlm2
|
"model.layers.{bid}.attention.wo", # internlm2
|
||||||
),
|
),
|
||||||
|
|
||||||
|
# Attention output norm
|
||||||
|
MODEL_TENSOR.ATTN_OUT_NORM: (
|
||||||
|
"encoder.layer.{bid}.attention.output.LayerNorm", # bert
|
||||||
|
),
|
||||||
|
|
||||||
# Rotary embeddings
|
# Rotary embeddings
|
||||||
MODEL_TENSOR.ATTN_ROT_EMBD: (
|
MODEL_TENSOR.ATTN_ROT_EMBD: (
|
||||||
"model.layers.{bid}.self_attn.rotary_emb.inv_freq", # llama-hf
|
"model.layers.{bid}.self_attn.rotary_emb.inv_freq", # llama-hf
|
||||||
|
@ -171,7 +175,6 @@ class TensorNameMap:
|
||||||
"transformer.blocks.{bid}.norm_2", # mpt
|
"transformer.blocks.{bid}.norm_2", # mpt
|
||||||
"model.layers.{bid}.post_attention_layernorm", # llama-hf
|
"model.layers.{bid}.post_attention_layernorm", # llama-hf
|
||||||
"layers.{bid}.ffn_norm", # llama-pth
|
"layers.{bid}.ffn_norm", # llama-pth
|
||||||
"encoder.layer.{bid}.output.LayerNorm", # bert
|
|
||||||
"language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon
|
"language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon
|
||||||
"model.layers.{bid}.ln2", # yi
|
"model.layers.{bid}.ln2", # yi
|
||||||
"h.{bid}.ln_2", # gpt2
|
"h.{bid}.ln_2", # gpt2
|
||||||
|
@ -266,6 +269,10 @@ class TensorNameMap:
|
||||||
MODEL_TENSOR.ROPE_FREQS: (
|
MODEL_TENSOR.ROPE_FREQS: (
|
||||||
"language_model.encoder.layers.{bid}.self_attention.rotary_emb.inv_freq", # persimmon
|
"language_model.encoder.layers.{bid}.self_attention.rotary_emb.inv_freq", # persimmon
|
||||||
),
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.LAYER_OUT_NORM: (
|
||||||
|
"encoder.layer.{bid}.output.LayerNorm", # bert
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
mapping: dict[str, tuple[MODEL_TENSOR, str]]
|
mapping: dict[str, tuple[MODEL_TENSOR, str]]
|
||||||
|
|
487
llama.cpp
487
llama.cpp
|
@ -196,6 +196,7 @@ enum llm_arch {
|
||||||
LLM_ARCH_STARCODER,
|
LLM_ARCH_STARCODER,
|
||||||
LLM_ARCH_PERSIMMON,
|
LLM_ARCH_PERSIMMON,
|
||||||
LLM_ARCH_REFACT,
|
LLM_ARCH_REFACT,
|
||||||
|
LLM_ARCH_BERT,
|
||||||
LLM_ARCH_BLOOM,
|
LLM_ARCH_BLOOM,
|
||||||
LLM_ARCH_STABLELM,
|
LLM_ARCH_STABLELM,
|
||||||
LLM_ARCH_QWEN,
|
LLM_ARCH_QWEN,
|
||||||
|
@ -220,6 +221,7 @@ static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||||
{ LLM_ARCH_STARCODER, "starcoder" },
|
{ LLM_ARCH_STARCODER, "starcoder" },
|
||||||
{ LLM_ARCH_PERSIMMON, "persimmon" },
|
{ LLM_ARCH_PERSIMMON, "persimmon" },
|
||||||
{ LLM_ARCH_REFACT, "refact" },
|
{ LLM_ARCH_REFACT, "refact" },
|
||||||
|
{ LLM_ARCH_BERT, "bert" },
|
||||||
{ LLM_ARCH_BLOOM, "bloom" },
|
{ LLM_ARCH_BLOOM, "bloom" },
|
||||||
{ LLM_ARCH_STABLELM, "stablelm" },
|
{ LLM_ARCH_STABLELM, "stablelm" },
|
||||||
{ LLM_ARCH_QWEN, "qwen" },
|
{ LLM_ARCH_QWEN, "qwen" },
|
||||||
|
@ -261,6 +263,7 @@ enum llm_kv {
|
||||||
LLM_KV_ATTENTION_VALUE_LENGTH,
|
LLM_KV_ATTENTION_VALUE_LENGTH,
|
||||||
LLM_KV_ATTENTION_LAYERNORM_EPS,
|
LLM_KV_ATTENTION_LAYERNORM_EPS,
|
||||||
LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
|
LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
|
||||||
|
LLM_KV_ATTENTION_CAUSAL,
|
||||||
|
|
||||||
LLM_KV_ROPE_DIMENSION_COUNT,
|
LLM_KV_ROPE_DIMENSION_COUNT,
|
||||||
LLM_KV_ROPE_FREQ_BASE,
|
LLM_KV_ROPE_FREQ_BASE,
|
||||||
|
@ -273,6 +276,7 @@ enum llm_kv {
|
||||||
LLM_KV_TOKENIZER_MODEL,
|
LLM_KV_TOKENIZER_MODEL,
|
||||||
LLM_KV_TOKENIZER_LIST,
|
LLM_KV_TOKENIZER_LIST,
|
||||||
LLM_KV_TOKENIZER_TOKEN_TYPE,
|
LLM_KV_TOKENIZER_TOKEN_TYPE,
|
||||||
|
LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT,
|
||||||
LLM_KV_TOKENIZER_SCORES,
|
LLM_KV_TOKENIZER_SCORES,
|
||||||
LLM_KV_TOKENIZER_MERGES,
|
LLM_KV_TOKENIZER_MERGES,
|
||||||
LLM_KV_TOKENIZER_BOS_ID,
|
LLM_KV_TOKENIZER_BOS_ID,
|
||||||
|
@ -316,6 +320,7 @@ static std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||||
{ LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
|
{ LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
|
||||||
{ LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
|
{ LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
|
||||||
{ LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
|
{ LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
|
||||||
|
{ LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
|
||||||
|
|
||||||
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
|
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
|
||||||
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
|
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
|
||||||
|
@ -328,6 +333,7 @@ static std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||||
{ LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
|
{ LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
|
||||||
{ LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
|
{ LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
|
||||||
{ LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" },
|
{ LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" },
|
||||||
|
{ LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" },
|
||||||
{ LLM_KV_TOKENIZER_SCORES, "tokenizer.ggml.scores" },
|
{ LLM_KV_TOKENIZER_SCORES, "tokenizer.ggml.scores" },
|
||||||
{ LLM_KV_TOKENIZER_MERGES, "tokenizer.ggml.merges" },
|
{ LLM_KV_TOKENIZER_MERGES, "tokenizer.ggml.merges" },
|
||||||
{ LLM_KV_TOKENIZER_BOS_ID, "tokenizer.ggml.bos_token_id" },
|
{ LLM_KV_TOKENIZER_BOS_ID, "tokenizer.ggml.bos_token_id" },
|
||||||
|
@ -355,6 +361,7 @@ struct LLM_KV {
|
||||||
enum llm_tensor {
|
enum llm_tensor {
|
||||||
LLM_TENSOR_TOKEN_EMBD,
|
LLM_TENSOR_TOKEN_EMBD,
|
||||||
LLM_TENSOR_TOKEN_EMBD_NORM,
|
LLM_TENSOR_TOKEN_EMBD_NORM,
|
||||||
|
LLM_TENSOR_TOKEN_TYPES,
|
||||||
LLM_TENSOR_POS_EMBD,
|
LLM_TENSOR_POS_EMBD,
|
||||||
LLM_TENSOR_OUTPUT,
|
LLM_TENSOR_OUTPUT,
|
||||||
LLM_TENSOR_OUTPUT_NORM,
|
LLM_TENSOR_OUTPUT_NORM,
|
||||||
|
@ -536,6 +543,23 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
|
||||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
LLM_ARCH_BERT,
|
||||||
|
{
|
||||||
|
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||||
|
{ LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
|
||||||
|
{ LLM_TENSOR_TOKEN_TYPES, "token_types" },
|
||||||
|
{ LLM_TENSOR_POS_EMBD, "position_embd" },
|
||||||
|
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_output_norm" },
|
||||||
|
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||||
|
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||||
|
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||||
|
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||||
|
{ LLM_TENSOR_FFN_NORM, "blk.%d.layer_output_norm" },
|
||||||
|
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||||
|
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||||
|
},
|
||||||
|
},
|
||||||
{
|
{
|
||||||
LLM_ARCH_BLOOM,
|
LLM_ARCH_BLOOM,
|
||||||
{
|
{
|
||||||
|
@ -1440,6 +1464,11 @@ static llama_state g_state;
|
||||||
// available llama models
|
// available llama models
|
||||||
enum e_model {
|
enum e_model {
|
||||||
MODEL_UNKNOWN,
|
MODEL_UNKNOWN,
|
||||||
|
MODEL_17M,
|
||||||
|
MODEL_22M,
|
||||||
|
MODEL_33M,
|
||||||
|
MODEL_109M,
|
||||||
|
MODEL_335M,
|
||||||
MODEL_0_5B,
|
MODEL_0_5B,
|
||||||
MODEL_1B,
|
MODEL_1B,
|
||||||
MODEL_2B,
|
MODEL_2B,
|
||||||
|
@ -1481,6 +1510,7 @@ struct llama_hparams {
|
||||||
uint32_t n_ff;
|
uint32_t n_ff;
|
||||||
uint32_t n_expert = 0;
|
uint32_t n_expert = 0;
|
||||||
uint32_t n_expert_used = 0;
|
uint32_t n_expert_used = 0;
|
||||||
|
uint32_t n_vocab_type = 0; // for BERT-style token types
|
||||||
|
|
||||||
float f_norm_eps;
|
float f_norm_eps;
|
||||||
float f_norm_rms_eps;
|
float f_norm_rms_eps;
|
||||||
|
@ -1493,6 +1523,8 @@ struct llama_hparams {
|
||||||
float f_clamp_kqv;
|
float f_clamp_kqv;
|
||||||
float f_max_alibi_bias;
|
float f_max_alibi_bias;
|
||||||
|
|
||||||
|
bool causal_attn = true;
|
||||||
|
|
||||||
|
|
||||||
bool operator!=(const llama_hparams & other) const {
|
bool operator!=(const llama_hparams & other) const {
|
||||||
if (this->vocab_only != other.vocab_only) return true;
|
if (this->vocab_only != other.vocab_only) return true;
|
||||||
|
@ -1720,6 +1752,7 @@ struct llama_model {
|
||||||
llama_vocab vocab;
|
llama_vocab vocab;
|
||||||
|
|
||||||
struct ggml_tensor * tok_embd;
|
struct ggml_tensor * tok_embd;
|
||||||
|
struct ggml_tensor * type_embd;
|
||||||
struct ggml_tensor * pos_embd;
|
struct ggml_tensor * pos_embd;
|
||||||
struct ggml_tensor * tok_norm;
|
struct ggml_tensor * tok_norm;
|
||||||
struct ggml_tensor * tok_norm_b;
|
struct ggml_tensor * tok_norm_b;
|
||||||
|
@ -1848,6 +1881,7 @@ struct llama_context {
|
||||||
struct ggml_tensor * inp_pos; // I32 [n_batch]
|
struct ggml_tensor * inp_pos; // I32 [n_batch]
|
||||||
struct ggml_tensor * inp_KQ_mask; // F32 [n_ctx, n_batch]
|
struct ggml_tensor * inp_KQ_mask; // F32 [n_ctx, n_batch]
|
||||||
struct ggml_tensor * inp_K_shift; // I32 [n_ctx]
|
struct ggml_tensor * inp_K_shift; // I32 [n_ctx]
|
||||||
|
struct ggml_tensor * inp_sum; // F32 [1, n_batch]
|
||||||
|
|
||||||
#ifdef GGML_USE_MPI
|
#ifdef GGML_USE_MPI
|
||||||
ggml_mpi_context * ctx_mpi = NULL;
|
ggml_mpi_context * ctx_mpi = NULL;
|
||||||
|
@ -2827,6 +2861,7 @@ static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
|
||||||
switch (type) {
|
switch (type) {
|
||||||
case LLAMA_VOCAB_TYPE_SPM: return "SPM";
|
case LLAMA_VOCAB_TYPE_SPM: return "SPM";
|
||||||
case LLAMA_VOCAB_TYPE_BPE: return "BPE";
|
case LLAMA_VOCAB_TYPE_BPE: return "BPE";
|
||||||
|
case LLAMA_VOCAB_TYPE_WPM: return "WPM";
|
||||||
default: return "unknown";
|
default: return "unknown";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -2998,6 +3033,26 @@ static void llm_load_hparams(
|
||||||
default: model.type = e_model::MODEL_UNKNOWN;
|
default: model.type = e_model::MODEL_UNKNOWN;
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
|
case LLM_ARCH_BERT:
|
||||||
|
{
|
||||||
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||||
|
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
||||||
|
ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
|
||||||
|
|
||||||
|
switch (hparams.n_layer) {
|
||||||
|
case 3:
|
||||||
|
model.type = e_model::MODEL_17M; break; // bge-micro
|
||||||
|
case 6:
|
||||||
|
model.type = e_model::MODEL_22M; break; // MiniLM-L6
|
||||||
|
case 12:
|
||||||
|
switch (hparams.n_embd) {
|
||||||
|
case 384: model.type = e_model::MODEL_33M; break; // MiniLM-L12, bge-small
|
||||||
|
case 768: model.type = e_model::MODEL_109M; break; // bge-base
|
||||||
|
} break;
|
||||||
|
case 24:
|
||||||
|
model.type = e_model::MODEL_335M; break; // bge-large
|
||||||
|
}
|
||||||
|
} break;
|
||||||
case LLM_ARCH_BLOOM:
|
case LLM_ARCH_BLOOM:
|
||||||
{
|
{
|
||||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||||
|
@ -3202,6 +3257,16 @@ static void llm_load_vocab(
|
||||||
vocab.special_unk_id = -1;
|
vocab.special_unk_id = -1;
|
||||||
vocab.special_sep_id = -1;
|
vocab.special_sep_id = -1;
|
||||||
vocab.special_pad_id = -1;
|
vocab.special_pad_id = -1;
|
||||||
|
} else if (tokenizer_name == "bert") {
|
||||||
|
vocab.type = LLAMA_VOCAB_TYPE_WPM;
|
||||||
|
|
||||||
|
// default special tokens
|
||||||
|
vocab.special_bos_id = 101;
|
||||||
|
vocab.special_eos_id = 102;
|
||||||
|
vocab.special_unk_id = 100;
|
||||||
|
vocab.special_sep_id = -1;
|
||||||
|
vocab.special_pad_id = -1;
|
||||||
|
vocab.add_space_prefix = false;
|
||||||
} else {
|
} else {
|
||||||
LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str());
|
LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str());
|
||||||
LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
|
LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
|
||||||
|
@ -3230,6 +3295,8 @@ static void llm_load_vocab(
|
||||||
// determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
|
// determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
|
||||||
if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
|
if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
|
||||||
vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
|
vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
|
||||||
|
} else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
|
||||||
|
vocab.linefeed_id = vocab.special_pad_id;
|
||||||
} else {
|
} else {
|
||||||
const std::vector<int> ids = llama_tokenize_internal(vocab, "\u010A", false);
|
const std::vector<int> ids = llama_tokenize_internal(vocab, "\u010A", false);
|
||||||
GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
|
GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
|
||||||
|
@ -3567,6 +3634,7 @@ static bool llm_load_tensors(
|
||||||
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
|
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
|
||||||
const int64_t n_embd_gqa = n_embd_v_gqa;
|
const int64_t n_embd_gqa = n_embd_v_gqa;
|
||||||
const int64_t n_vocab = hparams.n_vocab;
|
const int64_t n_vocab = hparams.n_vocab;
|
||||||
|
const int64_t n_vocab_type = hparams.n_vocab_type;
|
||||||
const int64_t n_ff = hparams.n_ff;
|
const int64_t n_ff = hparams.n_ff;
|
||||||
|
|
||||||
GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
|
GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
|
||||||
|
@ -3781,11 +3849,50 @@ static bool llm_load_tensors(
|
||||||
layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64});
|
layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64});
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
|
case LLM_ARCH_BERT:
|
||||||
|
{
|
||||||
|
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
||||||
|
model.type_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type});
|
||||||
|
model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train});
|
||||||
|
model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
|
||||||
|
model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd});
|
||||||
|
|
||||||
|
for (int i = 0; i < n_layer; ++i) {
|
||||||
|
ggml_context * ctx_layer = ctx_for_layer(i);
|
||||||
|
ggml_context * ctx_split = ctx_for_layer_split(i);
|
||||||
|
|
||||||
|
auto & layer = model.layers[i];
|
||||||
|
|
||||||
|
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
||||||
|
layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
|
||||||
|
|
||||||
|
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
||||||
|
layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
|
||||||
|
|
||||||
|
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
||||||
|
layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
|
||||||
|
|
||||||
|
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
|
||||||
|
layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
|
||||||
|
|
||||||
|
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
||||||
|
layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
|
||||||
|
|
||||||
|
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
||||||
|
layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
|
||||||
|
|
||||||
|
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
||||||
|
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
|
||||||
|
|
||||||
|
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
|
||||||
|
layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
|
||||||
|
}
|
||||||
|
} break;
|
||||||
case LLM_ARCH_BLOOM:
|
case LLM_ARCH_BLOOM:
|
||||||
{
|
{
|
||||||
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
||||||
model.tok_norm = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
|
model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
|
||||||
model.tok_norm_b = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd});
|
model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd});
|
||||||
|
|
||||||
// output
|
// output
|
||||||
{
|
{
|
||||||
|
@ -4737,6 +4844,7 @@ struct llm_build_context {
|
||||||
const int32_t n_orig_ctx;
|
const int32_t n_orig_ctx;
|
||||||
|
|
||||||
const bool do_rope_shift;
|
const bool do_rope_shift;
|
||||||
|
const bool causal_attn;
|
||||||
|
|
||||||
const llm_build_cb & cb;
|
const llm_build_cb & cb;
|
||||||
|
|
||||||
|
@ -4780,6 +4888,7 @@ struct llm_build_context {
|
||||||
kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
|
kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
|
||||||
n_orig_ctx (cparams.n_yarn_orig_ctx),
|
n_orig_ctx (cparams.n_yarn_orig_ctx),
|
||||||
do_rope_shift (worst_case || kv_self.has_shift),
|
do_rope_shift (worst_case || kv_self.has_shift),
|
||||||
|
causal_attn (hparams.causal_attn),
|
||||||
cb (cb),
|
cb (cb),
|
||||||
buf_compute_meta (lctx.buf_compute_meta) {
|
buf_compute_meta (lctx.buf_compute_meta) {
|
||||||
// all initializations should be done in init()
|
// all initializations should be done in init()
|
||||||
|
@ -5623,6 +5732,100 @@ struct llm_build_context {
|
||||||
return gf;
|
return gf;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct ggml_cgraph * build_bert() {
|
||||||
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||||
|
|
||||||
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||||
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||||
|
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||||
|
|
||||||
|
struct ggml_tensor * cur;
|
||||||
|
struct ggml_tensor * inpL;
|
||||||
|
|
||||||
|
// get input vectors with right size
|
||||||
|
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
||||||
|
struct ggml_tensor * inp_sum = ggml_view_1d(ctx0, lctx.inp_sum, n_tokens, 0);
|
||||||
|
|
||||||
|
// construct input embeddings (token, type, position)
|
||||||
|
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
||||||
|
// token types are hardcoded to zero ("Sentence A")
|
||||||
|
struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
|
||||||
|
inpL = ggml_add(ctx0, inpL, type_row0);
|
||||||
|
inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL);
|
||||||
|
cb(inpL, "inp_embd", -1);
|
||||||
|
|
||||||
|
// embed layer norm
|
||||||
|
inpL = llm_build_norm(ctx0, inpL, hparams, model.tok_norm, model.tok_norm_b, LLM_NORM, cb, -1);
|
||||||
|
cb(inpL, "inp_norm", -1);
|
||||||
|
|
||||||
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||||
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||||
|
cb(KQ_mask, "KQ_mask", -1); // [n_kv, n_tokens]
|
||||||
|
|
||||||
|
// iterate layers
|
||||||
|
for (int il = 0; il < n_layer; ++il) {
|
||||||
|
struct ggml_tensor * cur = inpL;
|
||||||
|
|
||||||
|
// self-attention
|
||||||
|
{
|
||||||
|
struct ggml_tensor * Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
|
||||||
|
cb(Qcur, "Qcur", il);
|
||||||
|
|
||||||
|
struct ggml_tensor * Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk);
|
||||||
|
cb(Kcur, "Kcur", il);
|
||||||
|
|
||||||
|
struct ggml_tensor * Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv);
|
||||||
|
cb(Vcur, "Vcur", il);
|
||||||
|
|
||||||
|
// seems like we just need to do this for Q?
|
||||||
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||||
|
|
||||||
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
||||||
|
model.layers[il].wo, model.layers[il].bo,
|
||||||
|
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
||||||
|
cb(cur, "kqv_out", il);
|
||||||
|
}
|
||||||
|
|
||||||
|
// re-add the layer input
|
||||||
|
cur = ggml_add(ctx0, cur, inpL);
|
||||||
|
|
||||||
|
// attention layer norm
|
||||||
|
cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_norm, model.layers[il].attn_norm_b, LLM_NORM, cb, il);
|
||||||
|
|
||||||
|
struct ggml_tensor * ffn_inp = cur;
|
||||||
|
cb(ffn_inp, "ffn_inp", il);
|
||||||
|
|
||||||
|
// feed-forward network
|
||||||
|
cur = llm_build_ffn(ctx0, cur,
|
||||||
|
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
||||||
|
NULL, NULL,
|
||||||
|
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
||||||
|
NULL,
|
||||||
|
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
||||||
|
cb(cur, "ffn_out", il);
|
||||||
|
|
||||||
|
// attentions bypass the intermediate layer
|
||||||
|
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||||
|
|
||||||
|
// output layer norm
|
||||||
|
cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, LLM_NORM, cb, il);
|
||||||
|
|
||||||
|
// input for next layer
|
||||||
|
inpL = cur;
|
||||||
|
}
|
||||||
|
|
||||||
|
// final output
|
||||||
|
cur = inpL;
|
||||||
|
|
||||||
|
// pooling
|
||||||
|
cur = ggml_mul_mat(ctx0, inp_sum, ggml_cont(ctx0, ggml_transpose(ctx0, cur)));
|
||||||
|
cb(cur, "result_embed", -1);
|
||||||
|
|
||||||
|
ggml_build_forward_expand(gf, cur);
|
||||||
|
|
||||||
|
return gf;
|
||||||
|
}
|
||||||
|
|
||||||
struct ggml_cgraph * build_bloom() {
|
struct ggml_cgraph * build_bloom() {
|
||||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||||
|
|
||||||
|
@ -7045,6 +7248,10 @@ static struct ggml_cgraph * llama_build_graph(
|
||||||
{
|
{
|
||||||
result = llm.build_refact();
|
result = llm.build_refact();
|
||||||
} break;
|
} break;
|
||||||
|
case LLM_ARCH_BERT:
|
||||||
|
{
|
||||||
|
result = llm.build_bert();
|
||||||
|
} break;
|
||||||
case LLM_ARCH_BLOOM:
|
case LLM_ARCH_BLOOM:
|
||||||
{
|
{
|
||||||
result = llm.build_bloom();
|
result = llm.build_bloom();
|
||||||
|
@ -7156,6 +7363,16 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
{
|
||||||
|
assert(ggml_backend_buffer_is_host(lctx.inp_sum->buffer));
|
||||||
|
float * data = (float *) lctx.inp_sum->data;
|
||||||
|
|
||||||
|
for (int i = 0; i < batch.n_tokens; ++i) {
|
||||||
|
data[i] = 1.0f/float(batch.n_tokens);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (kv_self.has_shift) {
|
if (kv_self.has_shift) {
|
||||||
const int64_t n_ctx = cparams.n_ctx;
|
const int64_t n_ctx = cparams.n_ctx;
|
||||||
|
|
||||||
|
@ -7271,14 +7488,19 @@ static int llama_decode_internal(
|
||||||
|
|
||||||
// the output is always the last tensor in the graph
|
// the output is always the last tensor in the graph
|
||||||
struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
|
struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
|
||||||
GGML_ASSERT(strcmp(res->name, "result_output") == 0);
|
|
||||||
|
|
||||||
// the embeddings could be the second to last tensor, or the third to last tensor
|
|
||||||
struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
|
struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
|
||||||
|
if (strcmp(res->name, "result_output") == 0) {
|
||||||
|
// the embeddings could be the second to last tensor, or the third to last tensor
|
||||||
if (strcmp(embeddings->name, "result_norm") != 0) {
|
if (strcmp(embeddings->name, "result_norm") != 0) {
|
||||||
embeddings = gf->nodes[gf->n_nodes - 3];
|
embeddings = gf->nodes[gf->n_nodes - 3];
|
||||||
GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
|
GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
|
||||||
}
|
}
|
||||||
|
} else if (strcmp(res->name, "result_embed") == 0) {
|
||||||
|
embeddings = res;
|
||||||
|
res = nullptr;
|
||||||
|
} else {
|
||||||
|
GGML_ASSERT(false);
|
||||||
|
}
|
||||||
|
|
||||||
// LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
|
// LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
|
||||||
|
|
||||||
|
@ -7349,7 +7571,7 @@ static int llama_decode_internal(
|
||||||
// extract logits
|
// extract logits
|
||||||
// TODO: do not compute and extract logits if only embeddings are needed
|
// TODO: do not compute and extract logits if only embeddings are needed
|
||||||
// need to update the graphs to skip "result_output"
|
// need to update the graphs to skip "result_output"
|
||||||
{
|
if (res) {
|
||||||
auto & logits_out = lctx.logits;
|
auto & logits_out = lctx.logits;
|
||||||
|
|
||||||
#ifndef NDEBUG
|
#ifndef NDEBUG
|
||||||
|
@ -7393,9 +7615,11 @@ static int llama_decode_internal(
|
||||||
if (!lctx.embedding.empty()) {
|
if (!lctx.embedding.empty()) {
|
||||||
auto & embedding_out = lctx.embedding;
|
auto & embedding_out = lctx.embedding;
|
||||||
|
|
||||||
|
const int64_t embed_pos = res ? n_embd * (n_tokens-1) : 0;
|
||||||
|
|
||||||
embedding_out.resize(n_embd);
|
embedding_out.resize(n_embd);
|
||||||
ggml_backend_t embeddings_backend = ggml_backend_sched_get_node_backend(lctx.sched, embeddings);
|
ggml_backend_t embeddings_backend = ggml_backend_sched_get_node_backend(lctx.sched, embeddings);
|
||||||
ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), (n_embd*(n_tokens - 1))*sizeof(float), n_embd*sizeof(float));
|
ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), embed_pos*sizeof(float), n_embd*sizeof(float));
|
||||||
ggml_backend_synchronize(embeddings_backend);
|
ggml_backend_synchronize(embeddings_backend);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -7459,6 +7683,9 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
|
||||||
GGML_ASSERT(false);
|
GGML_ASSERT(false);
|
||||||
return unicode_to_bytes_bpe(token_data.text);
|
return unicode_to_bytes_bpe(token_data.text);
|
||||||
}
|
}
|
||||||
|
case LLAMA_VOCAB_TYPE_WPM: {
|
||||||
|
GGML_ASSERT(false);
|
||||||
|
}
|
||||||
default:
|
default:
|
||||||
GGML_ASSERT(false);
|
GGML_ASSERT(false);
|
||||||
}
|
}
|
||||||
|
@ -7471,6 +7698,7 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
|
||||||
const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
|
const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
|
||||||
return vocab.token_to_id.at(buf);
|
return vocab.token_to_id.at(buf);
|
||||||
}
|
}
|
||||||
|
case LLAMA_VOCAB_TYPE_WPM:
|
||||||
case LLAMA_VOCAB_TYPE_BPE: {
|
case LLAMA_VOCAB_TYPE_BPE: {
|
||||||
return vocab.token_to_id.at(bytes_to_unicode_bpe(ch));
|
return vocab.token_to_id.at(bytes_to_unicode_bpe(ch));
|
||||||
}
|
}
|
||||||
|
@ -7941,12 +8169,212 @@ private:
|
||||||
llm_bigram_bpe::queue work_queue;
|
llm_bigram_bpe::queue work_queue;
|
||||||
};
|
};
|
||||||
|
|
||||||
typedef enum FRAGMENT_BUFFER_VARIANT_TYPE{
|
struct llm_tokenizer_wpm {
|
||||||
|
// Constructs a WPM tokenizer over `vocab`; the `vocab` member initialised here
// supplies the token map and the special token ids read by tokenize().
llm_tokenizer_wpm(const llama_vocab & vocab): vocab(vocab) {}
|
||||||
|
|
||||||
|
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
|
||||||
|
auto * token_map = &vocab.token_to_id;
|
||||||
|
|
||||||
|
// normalize and split by whitespace
|
||||||
|
std::vector<std::string> words = preprocess(text);
|
||||||
|
|
||||||
|
// bos token prepended already
|
||||||
|
|
||||||
|
// find the longest tokens that form the words
|
||||||
|
for (const std::string &word : words) {
|
||||||
|
// skip empty words
|
||||||
|
if (word.size() == 0) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// prepend phantom space
|
||||||
|
std::string word1 = "\xe2\x96\x81" + word;
|
||||||
|
int n = word1.size();
|
||||||
|
|
||||||
|
// we're at the start of a new word
|
||||||
|
int i = 0;
|
||||||
|
bool match_any = false;
|
||||||
|
|
||||||
|
// move through character position in word
|
||||||
|
while (i < n) {
|
||||||
|
// loop through possible match length
|
||||||
|
bool match = false;
|
||||||
|
for (int j = n; j > i; j--) {
|
||||||
|
auto it = token_map->find(word1.substr(i, j - i));
|
||||||
|
if (it != token_map->end()) {
|
||||||
|
output.push_back(it->second);
|
||||||
|
match = true;
|
||||||
|
match_any = true;
|
||||||
|
i = j;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// must be an unknown character
|
||||||
|
if (!match) {
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// we didn't find any matches for this word
|
||||||
|
if (!match_any) {
|
||||||
|
output.push_back(vocab.special_unk_id);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// append eos token
|
||||||
|
output.push_back(vocab.special_eos_id);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<std::string> preprocess(const std::string & text) {
|
||||||
|
std::string ori_str = normalize(text);
|
||||||
|
uint64_t ori_size = ori_str.size();
|
||||||
|
|
||||||
|
// single punct / single symbol / single digit
|
||||||
|
// baseline: add whitespace on the left and right of punct and chinese characters
|
||||||
|
std::vector<std::string> words;
|
||||||
|
std::string new_str = "";
|
||||||
|
uint64_t i = 0;
|
||||||
|
while (i < ori_size) {
|
||||||
|
int utf_char_len = utf8_len(ori_str[i]);
|
||||||
|
if ((utf_char_len == 1) && ispunct(ori_str[i])) {
|
||||||
|
new_str += " ";
|
||||||
|
new_str += ori_str[i];
|
||||||
|
new_str += " ";
|
||||||
|
i += 1;
|
||||||
|
}
|
||||||
|
else if ((utf_char_len == 3) && is_chinese_char(ori_str.substr(i, 3))) {
|
||||||
|
new_str += " ";
|
||||||
|
new_str += ori_str.substr(i, 3);
|
||||||
|
new_str += " ";
|
||||||
|
i += 3;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
new_str += ori_str[i];
|
||||||
|
i += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// split by whitespace
|
||||||
|
uint64_t l = 0;
|
||||||
|
uint64_t r = 0;
|
||||||
|
while (r < new_str.size()) {
|
||||||
|
// if is whitespace
|
||||||
|
if (isspace(new_str[r])) {
|
||||||
|
if (r > l) words.push_back(new_str.substr(l, (r - l)));
|
||||||
|
l = r + 1;
|
||||||
|
r = l;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
r += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (r > l) {
|
||||||
|
words.push_back(new_str.substr(l, (r - l)));
|
||||||
|
}
|
||||||
|
return words;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string normalize(const std::string & text) {
|
||||||
|
// TODO: handle chinese characters? https://github.com/huggingface/tokenizers/blob/ef5f50605ddf9f8caef1598c0e4853862b9707a7/tokenizers/src/normalizers/bert.rs#L98
|
||||||
|
std::string text2 = strip_accents(text);
|
||||||
|
for (size_t i = 0; i < text2.size(); i += utf8_len(text2[i])) {
|
||||||
|
char c = text2[i];
|
||||||
|
if (c >= 'A' && c <= 'Z') {
|
||||||
|
text2[i] = c - 'A' + 'a';
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return text2;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Decodes the first UTF-8 encoded code point of `str` and reports whether it
// lies in one of the code point ranges listed below.
//
// Returns false when the sequence is truncated, when a trailing byte is not a
// valid continuation byte, or when the decoded code point is outside every
// range. A leading byte that matches none of the 1- to 4-byte patterns decodes
// to code point 0 and therefore also yields false. Bytes after the first code
// point are ignored.
bool is_chinese_char(const std::string & str) {
    const int n_avail = str.length();
    const unsigned char lead = static_cast<unsigned char>(str[0]);

    // classify the leading byte: payload bits and total sequence length
    unsigned int cp      = 0;
    int          n_bytes = 0;
    if (lead < 0x80) {
        cp      = lead;
        n_bytes = 1;
    } else if ((lead & 0xE0) == 0xC0) {
        cp      = lead & 0x1f;
        n_bytes = 2;
    } else if ((lead & 0xF0) == 0xE0) {
        cp      = lead & 0x0f;
        n_bytes = 3;
    } else if ((lead & 0xF8) == 0xF0) {
        cp      = lead & 0x07;
        n_bytes = 4;
    }

    // fold in the continuation bytes, six payload bits each
    for (int k = 1; k < n_bytes; ++k) {
        if (k >= n_avail) {
            return false; // incomplete UTF-8 character
        }
        const unsigned char cont = static_cast<unsigned char>(str[k]);
        if ((cont & 0xC0) != 0x80) {
            return false; // invalid trailing byte
        }
        cp = (cp << 6) | (cont & 0x3f);
    }

    struct cp_range {
        unsigned int lo;
        unsigned int hi;
    };
    static const cp_range accepted[] = {
        { 0x4E00,  0x9FFF  },
        { 0x3400,  0x4DBF  },
        { 0x20000, 0x2A6DF },
        { 0x2A700, 0x2B73F },
        { 0x2B740, 0x2B81F },
        { 0x2B920, 0x2CEAF }, // this should be 0x2B820 but in hf rust code it is 0x2B920
        { 0xF900,  0xFAFF  },
        { 0x2F800, 0x2FA1F },
        { 0x3000,  0x303F  },
        { 0xFF00,  0xFFEF  },
    };

    for (const cp_range & range : accepted) {
        if (cp >= range.lo && cp <= range.hi) {
            return true;
        }
    }
    return false;
}
|
||||||
|
|
||||||
|
std::string strip_accents(const std::string & input_string) {
|
||||||
|
std::string resultString;
|
||||||
|
std::map<std::string, char> accent_map = {
|
||||||
|
{"À", 'A'}, {"Á", 'A'}, {"Â", 'A'}, {"Ã", 'A'}, {"Ä", 'A'}, {"Å", 'A'},
|
||||||
|
{"à", 'a'}, {"á", 'a'}, {"â", 'a'}, {"ã", 'a'}, {"ä", 'a'}, {"å", 'a'},
|
||||||
|
{"È", 'E'}, {"É", 'E'}, {"Ê", 'E'}, {"Ë", 'E'}, {"è", 'e'}, {"é", 'e'},
|
||||||
|
{"ê", 'e'}, {"ë", 'e'}, {"Ì", 'I'}, {"Í", 'I'}, {"Î", 'I'}, {"Ï", 'I'},
|
||||||
|
{"ì", 'i'}, {"í", 'i'}, {"î", 'i'}, {"ï", 'i'}, {"Ò", 'O'}, {"Ó", 'O'},
|
||||||
|
{"Ô", 'O'}, {"Õ", 'O'}, {"Ö", 'O'}, {"ò", 'o'}, {"ó", 'o'}, {"ô", 'o'},
|
||||||
|
{"õ", 'o'}, {"ö", 'o'}, {"Ù", 'U'}, {"Ú", 'U'}, {"Û", 'U'}, {"Ü", 'U'},
|
||||||
|
{"ù", 'u'}, {"ú", 'u'}, {"û", 'u'}, {"ü", 'u'}, {"Ý", 'Y'}, {"ý", 'y'},
|
||||||
|
{"Ç", 'C'}, {"ç", 'c'}, {"Ñ", 'N'}, {"ñ", 'n'},
|
||||||
|
};
|
||||||
|
|
||||||
|
for (size_t i = 0; i < input_string.length();) {
|
||||||
|
int len = utf8_len(input_string[i]);
|
||||||
|
std::string curChar = input_string.substr(i, len);
|
||||||
|
auto iter = accent_map.find(curChar);
|
||||||
|
if (iter != accent_map.end()) {
|
||||||
|
resultString += iter->second;
|
||||||
|
} else {
|
||||||
|
resultString += curChar;
|
||||||
|
}
|
||||||
|
i += len;
|
||||||
|
}
|
||||||
|
|
||||||
|
return resultString;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Length in bytes of the UTF-8 sequence introduced by `src`, decided from the
// top four bits of the byte. Continuation bytes (10xxxxxx) report 1, so a
// caller stepping through malformed input always advances.
static size_t utf8_len(char src) {
    const uint8_t top = static_cast<uint8_t>(src) >> 4;
    if (top < 0xC) {
        return 1; // 0xxx.... and 10xx....
    }
    if (top < 0xE) {
        return 2; // 110x....
    }
    if (top == 0xE) {
        return 3; // 1110....
    }
    return 4;     // 1111....
}
|
||||||
|
|
||||||
|
const llama_vocab & vocab;
|
||||||
|
};
|
||||||
|
|
||||||
|
typedef enum FRAGMENT_BUFFER_VARIANT_TYPE {
|
||||||
FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN,
|
FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN,
|
||||||
FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT
|
FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT
|
||||||
} FRAGMENT_BUFFER_VARIANT_TYPE;
|
} FRAGMENT_BUFFER_VARIANT_TYPE;
|
||||||
|
|
||||||
struct fragment_buffer_variant{
|
struct fragment_buffer_variant {
|
||||||
fragment_buffer_variant(llama_vocab::id _token)
|
fragment_buffer_variant(llama_vocab::id _token)
|
||||||
:
|
:
|
||||||
type(FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN),
|
type(FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN),
|
||||||
|
@ -7976,8 +8404,7 @@ struct fragment_buffer_variant{
|
||||||
|
|
||||||
// #define PRETOKENIZERDEBUG
|
// #define PRETOKENIZERDEBUG
|
||||||
|
|
||||||
static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer)
|
static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer) {
|
||||||
{
|
|
||||||
// for each special token
|
// for each special token
|
||||||
for (const auto & st: vocab.special_tokens_cache) {
|
for (const auto & st: vocab.special_tokens_cache) {
|
||||||
const auto & special_token = st.first;
|
const auto & special_token = st.first;
|
||||||
|
@ -8095,10 +8522,8 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
||||||
switch (vocab.type) {
|
switch (vocab.type) {
|
||||||
case LLAMA_VOCAB_TYPE_SPM:
|
case LLAMA_VOCAB_TYPE_SPM:
|
||||||
{
|
{
|
||||||
for (const auto & fragment: fragment_buffer)
|
for (const auto & fragment: fragment_buffer) {
|
||||||
{
|
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
|
||||||
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT)
|
|
||||||
{
|
|
||||||
// without adding this leading whitespace, we do not get the same results as the original tokenizer
|
// without adding this leading whitespace, we do not get the same results as the original tokenizer
|
||||||
|
|
||||||
// TODO: It's likely possible to get rid of this string copy entirely
|
// TODO: It's likely possible to get rid of this string copy entirely
|
||||||
|
@ -8118,19 +8543,15 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
||||||
llm_tokenizer_spm tokenizer(vocab);
|
llm_tokenizer_spm tokenizer(vocab);
|
||||||
llama_escape_whitespace(raw_text);
|
llama_escape_whitespace(raw_text);
|
||||||
tokenizer.tokenize(raw_text, output);
|
tokenizer.tokenize(raw_text, output);
|
||||||
}
|
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
|
||||||
else // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
|
|
||||||
{
|
|
||||||
output.push_back(fragment.token);
|
output.push_back(fragment.token);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
case LLAMA_VOCAB_TYPE_BPE:
|
case LLAMA_VOCAB_TYPE_BPE:
|
||||||
{
|
{
|
||||||
for (const auto & fragment: fragment_buffer)
|
for (const auto & fragment: fragment_buffer) {
|
||||||
{
|
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
|
||||||
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT)
|
|
||||||
{
|
|
||||||
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
|
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
|
||||||
|
|
||||||
#ifdef PRETOKENIZERDEBUG
|
#ifdef PRETOKENIZERDEBUG
|
||||||
|
@ -8138,9 +8559,23 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
||||||
#endif
|
#endif
|
||||||
llm_tokenizer_bpe tokenizer(vocab);
|
llm_tokenizer_bpe tokenizer(vocab);
|
||||||
tokenizer.tokenize(raw_text, output);
|
tokenizer.tokenize(raw_text, output);
|
||||||
|
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
|
||||||
|
output.push_back(fragment.token);
|
||||||
}
|
}
|
||||||
else // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
|
}
|
||||||
|
} break;
|
||||||
|
case LLAMA_VOCAB_TYPE_WPM:
|
||||||
{
|
{
|
||||||
|
for (const auto & fragment: fragment_buffer) {
|
||||||
|
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
|
||||||
|
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
|
||||||
|
|
||||||
|
#ifdef PRETOKENIZERDEBUG
|
||||||
|
LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
|
||||||
|
#endif
|
||||||
|
llm_tokenizer_wpm tokenizer(vocab);
|
||||||
|
tokenizer.tokenize(raw_text, output);
|
||||||
|
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
|
||||||
output.push_back(fragment.token);
|
output.push_back(fragment.token);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -10804,7 +11239,7 @@ struct llama_context * llama_new_context_with_model(
|
||||||
// graph inputs
|
// graph inputs
|
||||||
{
|
{
|
||||||
ggml_init_params init_params = {
|
ggml_init_params init_params = {
|
||||||
/* .mem_size */ ggml_tensor_overhead()*5,
|
/* .mem_size */ ggml_tensor_overhead()*7,
|
||||||
/* .mem_buffer */ nullptr,
|
/* .mem_buffer */ nullptr,
|
||||||
/* .no_alloc */ true,
|
/* .no_alloc */ true,
|
||||||
};
|
};
|
||||||
|
@ -10815,12 +11250,14 @@ struct llama_context * llama_new_context_with_model(
|
||||||
ctx->inp_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
|
ctx->inp_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
|
||||||
ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch);
|
ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch);
|
||||||
ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx);
|
ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx);
|
||||||
|
ctx->inp_sum = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, 1, cparams.n_batch);
|
||||||
|
|
||||||
ggml_set_name(ctx->inp_tokens, "inp_tokens");
|
ggml_set_name(ctx->inp_tokens, "inp_tokens");
|
||||||
ggml_set_name(ctx->inp_embd, "inp_embd");
|
ggml_set_name(ctx->inp_embd, "inp_embd");
|
||||||
ggml_set_name(ctx->inp_pos, "inp_pos");
|
ggml_set_name(ctx->inp_pos, "inp_pos");
|
||||||
ggml_set_name(ctx->inp_KQ_mask, "inp_KQ_mask");
|
ggml_set_name(ctx->inp_KQ_mask, "inp_KQ_mask");
|
||||||
ggml_set_name(ctx->inp_K_shift, "inp_K_shift");
|
ggml_set_name(ctx->inp_K_shift, "inp_K_shift");
|
||||||
|
ggml_set_name(ctx->inp_sum, "inp_sum");
|
||||||
|
|
||||||
ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));
|
ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));
|
||||||
|
|
||||||
|
@ -11755,6 +12192,7 @@ static std::string llama_decode_text(const std::string & text) {
|
||||||
int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length) {
|
int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length) {
|
||||||
if (0 <= token && token < llama_n_vocab(model)) {
|
if (0 <= token && token < llama_n_vocab(model)) {
|
||||||
switch (llama_vocab_get_type(model->vocab)) {
|
switch (llama_vocab_get_type(model->vocab)) {
|
||||||
|
case LLAMA_VOCAB_TYPE_WPM:
|
||||||
case LLAMA_VOCAB_TYPE_SPM: {
|
case LLAMA_VOCAB_TYPE_SPM: {
|
||||||
// NOTE: we accept all unsupported token types,
|
// NOTE: we accept all unsupported token types,
|
||||||
// suppressing them like CONTROL tokens.
|
// suppressing them like CONTROL tokens.
|
||||||
|
@ -11878,6 +12316,7 @@ const char * llama_print_system_info(void) {
|
||||||
s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
|
s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
|
||||||
s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
|
s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
|
||||||
s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
|
s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
|
||||||
|
s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";
|
||||||
|
|
||||||
return s.c_str();
|
return s.c_str();
|
||||||
}
|
}
|
||||||
|
|
1
llama.h
1
llama.h
|
@ -61,6 +61,7 @@ extern "C" {
|
||||||
enum llama_vocab_type {
|
enum llama_vocab_type {
|
||||||
LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
|
LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
|
||||||
LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
|
LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
|
||||||
|
LLAMA_VOCAB_TYPE_WPM = 2, // WordPiece
|
||||||
};
|
};
|
||||||
|
|
||||||
enum llama_token_type {
|
enum llama_token_type {
|
||||||
|
|
|
@ -156,8 +156,8 @@ int main(int argc, char** argv) {
|
||||||
|
|
||||||
t1 = std::chrono::high_resolution_clock::now();
|
t1 = std::chrono::high_resolution_clock::now();
|
||||||
float fs;
|
float fs;
|
||||||
if (type == 0) funcs.vec_dot(kVecSize * QK4_1, &fs, x40.data(), y.data());
|
if (type == 0) funcs.vec_dot(kVecSize * QK4_1, &fs, 0, x40.data(), 0, y.data(), 0, 1);
|
||||||
else funcs.vec_dot(kVecSize * QK4_1, &fs, x41.data(), y.data());
|
else funcs.vec_dot(kVecSize * QK4_1, &fs, 0, x41.data(), 0, y.data(), 0, 1);
|
||||||
t2 = std::chrono::high_resolution_clock::now();
|
t2 = std::chrono::high_resolution_clock::now();
|
||||||
t = 1e-3*std::chrono::duration_cast<std::chrono::nanoseconds>(t2-t1).count();
|
t = 1e-3*std::chrono::duration_cast<std::chrono::nanoseconds>(t2-t1).count();
|
||||||
if (iloop > 3) ggml.addResult(fs, t);
|
if (iloop > 3) ggml.addResult(fs, t);
|
||||||
|
|
|
@ -284,8 +284,8 @@ int main(int argc, char** argv) {
|
||||||
else {
|
else {
|
||||||
auto vdot = ggml_internal_get_type_traits(funcs.vec_dot_type);
|
auto vdot = ggml_internal_get_type_traits(funcs.vec_dot_type);
|
||||||
vdot.from_float(y1.data(), q8.data(), kVecSize);
|
vdot.from_float(y1.data(), q8.data(), kVecSize);
|
||||||
if (useQ4_1) funcs.vec_dot(kVecSize, &result, q41.data(), q8.data());
|
if (useQ4_1) funcs.vec_dot(kVecSize, &result, 0, q41.data(), 0, q8.data(), 0, 1);
|
||||||
else funcs.vec_dot(kVecSize, &result, q40.data(), q8.data());
|
else funcs.vec_dot(kVecSize, &result, 0, q40.data(), 0, q8.data(), 0, 1);
|
||||||
}
|
}
|
||||||
sumq += result;
|
sumq += result;
|
||||||
t2 = std::chrono::high_resolution_clock::now();
|
t2 = std::chrono::high_resolution_clock::now();
|
||||||
|
|
|
@ -87,7 +87,7 @@ static float dot_product_error(
|
||||||
vdot.from_float(test_data2, tmp_q2.data(), test_size);
|
vdot.from_float(test_data2, tmp_q2.data(), test_size);
|
||||||
|
|
||||||
float result = INFINITY;
|
float result = INFINITY;
|
||||||
qfns.vec_dot(test_size, &result, tmp_q1.data(), tmp_q2.data());
|
qfns.vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1);
|
||||||
|
|
||||||
const float dot_ref = dot_product(test_data1, test_data2, test_size);
|
const float dot_ref = dot_product(test_data1, test_data2, test_size);
|
||||||
|
|
||||||
|
|
|
@ -346,7 +346,7 @@ int main(int argc, char * argv[]) {
|
||||||
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
||||||
auto quantize_fn = [&](void) -> float {
|
auto quantize_fn = [&](void) -> float {
|
||||||
float result;
|
float result;
|
||||||
qfns.vec_dot(size, &result, test_q1, test_q2);
|
qfns.vec_dot(size, &result, 0, test_q1, 0, test_q2, 0, 1);
|
||||||
return result;
|
return result;
|
||||||
};
|
};
|
||||||
size_t quantized_size = ggml_row_size(type, size);
|
size_t quantized_size = ggml_row_size(type, size);
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue