Merge branch 'ggerganov:master' into master
This commit is contained in:
commit
8e8d76cb39
31 changed files with 868 additions and 197 deletions
|
@ -124,6 +124,7 @@ Typically finetunes of the base models below are supported as well.
|
||||||
- Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
|
- Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
|
||||||
- Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp)
|
- Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp)
|
||||||
- JS/TS (llama.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/llamacpp)
|
- JS/TS (llama.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/llamacpp)
|
||||||
|
- JavaScript/Wasm (works in browser): [tangledgroup/llama-cpp-wasm](https://github.com/tangledgroup/llama-cpp-wasm)
|
||||||
- Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
|
- Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
|
||||||
- Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
|
- Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
|
||||||
- Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs)
|
- Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs)
|
||||||
|
|
|
@ -340,13 +340,14 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
sparams.samplers_sequence = parse_samplers_input(argv[i]);
|
const auto sampler_names = string_split(argv[i], ';');
|
||||||
|
sparams.samplers_sequence = sampler_types_from_names(sampler_names);
|
||||||
} else if (arg == "--sampling-seq") {
|
} else if (arg == "--sampling-seq") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
sparams.samplers_sequence = argv[i];
|
sparams.samplers_sequence = sampler_types_from_chars(argv[i]);
|
||||||
} else if (arg == "--top-p") {
|
} else if (arg == "--top-p") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
|
@ -906,6 +907,14 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||||
void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||||
const llama_sampling_params & sparams = params.sparams;
|
const llama_sampling_params & sparams = params.sparams;
|
||||||
|
|
||||||
|
std::string sampler_type_chars;
|
||||||
|
std::string sampler_type_names;
|
||||||
|
for (const auto sampler_type : sparams.samplers_sequence) {
|
||||||
|
sampler_type_chars += static_cast<char>(sampler_type);
|
||||||
|
sampler_type_names += sampler_type_to_name_string(sampler_type) + ";";
|
||||||
|
}
|
||||||
|
sampler_type_names.pop_back();
|
||||||
|
|
||||||
printf("\n");
|
printf("\n");
|
||||||
printf("usage: %s [options]\n", argv[0]);
|
printf("usage: %s [options]\n", argv[0]);
|
||||||
printf("\n");
|
printf("\n");
|
||||||
|
@ -947,8 +956,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||||
printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
|
printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
|
||||||
printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
|
printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
|
||||||
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
||||||
printf(" --samplers samplers that will be used for generation in the order, separated by \';\', for example: \"top_k;tfs;typical;top_p;min_p;temp\"\n");
|
printf(" --samplers samplers that will be used for generation in the order, separated by \';\' (default: %s)\n", sampler_type_names.c_str());
|
||||||
printf(" --sampling-seq simplified sequence for samplers that will be used (default: %s)\n", sparams.samplers_sequence.c_str());
|
printf(" --sampling-seq simplified sequence for samplers that will be used (default: %s)\n", sampler_type_chars.c_str());
|
||||||
printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k);
|
printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k);
|
||||||
printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
|
printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
|
||||||
printf(" --min-p N min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p);
|
printf(" --min-p N min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p);
|
||||||
|
@ -1097,45 +1106,85 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
// String parsing
|
// String utils
|
||||||
//
|
//
|
||||||
|
|
||||||
std::string parse_samplers_input(std::string input) {
|
std::vector<std::string> string_split(std::string input, char separator) {
|
||||||
std::string output = "";
|
std::vector<std::string> parts;
|
||||||
|
size_t separator_pos = input.find(separator);
|
||||||
|
while (separator_pos != std::string::npos) {
|
||||||
|
std::string part = input.substr(0, separator_pos);
|
||||||
|
parts.emplace_back(part);
|
||||||
|
input = input.substr(separator_pos + 1);
|
||||||
|
separator_pos = input.find(separator);
|
||||||
|
}
|
||||||
|
parts.emplace_back(input);
|
||||||
|
return parts;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names) {
|
||||||
// since samplers names are written multiple ways
|
// since samplers names are written multiple ways
|
||||||
// make it ready for both system names and input names
|
// make it ready for both system names and input names
|
||||||
std::unordered_map<std::string, char> samplers_symbols {
|
std::unordered_map<std::string, llama_sampler_type> sampler_name_map {
|
||||||
{"top_k", 'k'},
|
{"top_k", llama_sampler_type::TOP_K},
|
||||||
{"top-k", 'k'},
|
{"top-k", llama_sampler_type::TOP_K},
|
||||||
{"top_p", 'p'},
|
{"top_p", llama_sampler_type::TOP_P},
|
||||||
{"top-p", 'p'},
|
{"top-p", llama_sampler_type::TOP_P},
|
||||||
{"nucleus", 'p'},
|
{"nucleus", llama_sampler_type::TOP_P},
|
||||||
{"typical_p", 'y'},
|
{"typical_p", llama_sampler_type::TYPICAL_P},
|
||||||
{"typical-p", 'y'},
|
{"typical-p", llama_sampler_type::TYPICAL_P},
|
||||||
{"typical", 'y'},
|
{"typical", llama_sampler_type::TYPICAL_P},
|
||||||
{"min_p", 'm'},
|
{"min_p", llama_sampler_type::MIN_P},
|
||||||
{"min-p", 'm'},
|
{"min-p", llama_sampler_type::MIN_P},
|
||||||
{"tfs_z", 'f'},
|
{"tfs_z", llama_sampler_type::TFS_Z},
|
||||||
{"tfs-z", 'f'},
|
{"tfs-z", llama_sampler_type::TFS_Z},
|
||||||
{"tfs", 'f'},
|
{"tfs", llama_sampler_type::TFS_Z},
|
||||||
{"temp", 't'},
|
{"temp", llama_sampler_type::TEMP},
|
||||||
{"temperature",'t'}
|
{"temperature", llama_sampler_type::TEMP}
|
||||||
};
|
};
|
||||||
// expected format example: "temp;top_k;tfs_z;typical_p;top_p;min_p"
|
|
||||||
size_t separator = input.find(';');
|
|
||||||
while (separator != input.npos) {
|
|
||||||
std::string name = input.substr(0,separator);
|
|
||||||
input = input.substr(separator+1);
|
|
||||||
separator = input.find(';');
|
|
||||||
|
|
||||||
if (samplers_symbols.find(name) != samplers_symbols.end()) {
|
std::vector<llama_sampler_type> sampler_types;
|
||||||
output += samplers_symbols[name];
|
sampler_types.reserve(names.size());
|
||||||
|
for (const auto& name : names) {
|
||||||
|
const auto sampler_item = sampler_name_map.find(name);
|
||||||
|
if (sampler_item != sampler_name_map.end()) {
|
||||||
|
sampler_types.push_back(sampler_item->second);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (samplers_symbols.find(input) != samplers_symbols.end()) {
|
return sampler_types;
|
||||||
output += samplers_symbols[input];
|
}
|
||||||
|
|
||||||
|
// Converts a compact sampler sequence (one character per sampler, the form
// accepted by --sampling-seq) into the corresponding list of sampler types.
//
// names_string: characters are looked up one at a time, in order.
// Returns the matched sampler types in input order. Characters that have no
// entry in the table below are skipped without any error, so the result can
// be shorter than the input, or empty.
std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string) {
|
||||||
|
// Lookup table: one-letter code -> sampler type.
std::unordered_map<char, llama_sampler_type> sampler_name_map {
|
||||||
|
{'k', llama_sampler_type::TOP_K},
|
||||||
|
{'p', llama_sampler_type::TOP_P},
|
||||||
|
{'y', llama_sampler_type::TYPICAL_P},
|
||||||
|
{'m', llama_sampler_type::MIN_P},
|
||||||
|
{'f', llama_sampler_type::TFS_Z},
|
||||||
|
{'t', llama_sampler_type::TEMP}
|
||||||
|
};
|
||||||
|
|
||||||
|
std::vector<llama_sampler_type> sampler_types;
|
||||||
|
// Upper bound on the result size: at most one sampler per input character.
sampler_types.reserve(names_string.size());
|
||||||
|
for (const auto & c : names_string) {
|
||||||
|
const auto sampler_item = sampler_name_map.find(c);
|
||||||
|
// Unknown characters are ignored rather than reported.
if (sampler_item != sampler_name_map.end()) {
|
||||||
|
sampler_types.push_back(sampler_item->second);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return sampler_types;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string sampler_type_to_name_string(llama_sampler_type sampler_type) {
|
||||||
|
switch (sampler_type) {
|
||||||
|
case llama_sampler_type::TOP_K: return "top_k";
|
||||||
|
case llama_sampler_type::TFS_Z: return "tfs_z";
|
||||||
|
case llama_sampler_type::TYPICAL_P: return "typical_p";
|
||||||
|
case llama_sampler_type::TOP_P: return "top_p";
|
||||||
|
case llama_sampler_type::MIN_P: return "min_p";
|
||||||
|
case llama_sampler_type::TEMP: return "temp";
|
||||||
|
default : return "";
|
||||||
}
|
}
|
||||||
return output;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
|
@ -1550,6 +1599,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
|
||||||
fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
|
fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
|
||||||
fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false");
|
fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false");
|
||||||
fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false");
|
fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false");
|
||||||
|
fprintf(stream, "cpu_has_matmul_int8: %s\n", ggml_cpu_has_matmul_int8() ? "true" : "false");
|
||||||
|
|
||||||
#ifdef NDEBUG
|
#ifdef NDEBUG
|
||||||
fprintf(stream, "debug: false\n");
|
fprintf(stream, "debug: false\n");
|
||||||
|
|
|
@ -162,10 +162,13 @@ std::string gpt_random_prompt(std::mt19937 & rng);
|
||||||
void process_escapes(std::string& input);
|
void process_escapes(std::string& input);
|
||||||
|
|
||||||
//
|
//
|
||||||
// String parsing
|
// String utils
|
||||||
//
|
//
|
||||||
|
|
||||||
std::string parse_samplers_input(std::string input);
|
std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names);
|
||||||
|
std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string);
|
||||||
|
std::vector<std::string> string_split(std::string input, char separator);
|
||||||
|
std::string sampler_type_to_name_string(llama_sampler_type sampler_type);
|
||||||
|
|
||||||
//
|
//
|
||||||
// Model utils
|
// Model utils
|
||||||
|
|
|
@ -103,15 +103,10 @@ std::string llama_sampling_print(const llama_sampling_params & params) {
|
||||||
std::string llama_sampling_order_print(const llama_sampling_params & params) {
|
std::string llama_sampling_order_print(const llama_sampling_params & params) {
|
||||||
std::string result = "CFG -> Penalties ";
|
std::string result = "CFG -> Penalties ";
|
||||||
if (params.mirostat == 0) {
|
if (params.mirostat == 0) {
|
||||||
for (auto s : params.samplers_sequence) {
|
for (auto sampler_type : params.samplers_sequence) {
|
||||||
switch (s) {
|
const auto sampler_type_name = sampler_type_to_name_string(sampler_type);
|
||||||
case 'k': result += "-> top_k "; break;
|
if (!sampler_type_name.empty()) {
|
||||||
case 'f': result += "-> tfs_z "; break;
|
result += "-> " + sampler_type_name + " ";
|
||||||
case 'y': result += "-> typical_p "; break;
|
|
||||||
case 'p': result += "-> top_p "; break;
|
|
||||||
case 'm': result += "-> min_p "; break;
|
|
||||||
case 't': result += "-> temp "; break;
|
|
||||||
default : break;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
@ -127,8 +122,6 @@ static void sampler_queue(
|
||||||
const llama_sampling_params & params,
|
const llama_sampling_params & params,
|
||||||
llama_token_data_array & cur_p,
|
llama_token_data_array & cur_p,
|
||||||
size_t & min_keep) {
|
size_t & min_keep) {
|
||||||
const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
|
|
||||||
|
|
||||||
const float temp = params.temp;
|
const float temp = params.temp;
|
||||||
const float dynatemp_range = params.dynatemp_range;
|
const float dynatemp_range = params.dynatemp_range;
|
||||||
const float dynatemp_exponent = params.dynatemp_exponent;
|
const float dynatemp_exponent = params.dynatemp_exponent;
|
||||||
|
@ -137,16 +130,16 @@ static void sampler_queue(
|
||||||
const float min_p = params.min_p;
|
const float min_p = params.min_p;
|
||||||
const float tfs_z = params.tfs_z;
|
const float tfs_z = params.tfs_z;
|
||||||
const float typical_p = params.typical_p;
|
const float typical_p = params.typical_p;
|
||||||
const std::string & samplers_sequence = params.samplers_sequence;
|
const std::vector<llama_sampler_type> & samplers_sequence = params.samplers_sequence;
|
||||||
|
|
||||||
for (auto s : samplers_sequence) {
|
for (auto sampler_type : samplers_sequence) {
|
||||||
switch (s){
|
switch (sampler_type) {
|
||||||
case 'k': llama_sample_top_k (ctx_main, &cur_p, top_k, min_keep); break;
|
case llama_sampler_type::TOP_K : llama_sample_top_k (ctx_main, &cur_p, top_k, min_keep); break;
|
||||||
case 'f': llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep); break;
|
case llama_sampler_type::TFS_Z : llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep); break;
|
||||||
case 'y': llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); break;
|
case llama_sampler_type::TYPICAL_P: llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); break;
|
||||||
case 'p': llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); break;
|
case llama_sampler_type::TOP_P : llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); break;
|
||||||
case 'm': llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); break;
|
case llama_sampler_type::MIN_P : llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); break;
|
||||||
case 't':
|
case llama_sampler_type::TEMP:
|
||||||
if (dynatemp_range > 0) {
|
if (dynatemp_range > 0) {
|
||||||
float dynatemp_min = std::max(0.0f, temp - dynatemp_range);
|
float dynatemp_min = std::max(0.0f, temp - dynatemp_range);
|
||||||
float dynatemp_max = std::max(0.0f, temp + dynatemp_range);
|
float dynatemp_max = std::max(0.0f, temp + dynatemp_range);
|
||||||
|
|
|
@ -8,6 +8,16 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <unordered_map>
|
#include <unordered_map>
|
||||||
|
|
||||||
|
// sampler types
|
||||||
|
// Identifies one step of the sampler chain. The underlying type is char and
// each enumerator's value is a one-letter code; the usage text casts an
// enumerator back to char to print the default --sampling-seq string.
enum class llama_sampler_type : char {
|
||||||
|
// top-k sampling
TOP_K = 'k',
|
||||||
|
// top-p sampling
TOP_P = 'p',
|
||||||
|
// min-p sampling
MIN_P = 'm',
|
||||||
|
// tail-free sampling
TFS_Z = 'f',
|
||||||
|
// typical sampling
TYPICAL_P = 'y',
|
||||||
|
// temperature
TEMP = 't'
|
||||||
|
};
|
||||||
|
|
||||||
// sampling parameters
|
// sampling parameters
|
||||||
typedef struct llama_sampling_params {
|
typedef struct llama_sampling_params {
|
||||||
int32_t n_prev = 64; // number of previous tokens to remember
|
int32_t n_prev = 64; // number of previous tokens to remember
|
||||||
|
@ -28,7 +38,15 @@ typedef struct llama_sampling_params {
|
||||||
float mirostat_tau = 5.00f; // target entropy
|
float mirostat_tau = 5.00f; // target entropy
|
||||||
float mirostat_eta = 0.10f; // learning rate
|
float mirostat_eta = 0.10f; // learning rate
|
||||||
bool penalize_nl = true; // consider newlines as a repeatable token
|
bool penalize_nl = true; // consider newlines as a repeatable token
|
||||||
std::string samplers_sequence = "kfypmt"; // top_k, tail_free, typical_p, top_p, min_p, temp
|
|
||||||
|
std::vector<llama_sampler_type> samplers_sequence = {
|
||||||
|
llama_sampler_type::TOP_K,
|
||||||
|
llama_sampler_type::TFS_Z,
|
||||||
|
llama_sampler_type::TYPICAL_P,
|
||||||
|
llama_sampler_type::TOP_P,
|
||||||
|
llama_sampler_type::MIN_P,
|
||||||
|
llama_sampler_type::TEMP
|
||||||
|
};
|
||||||
|
|
||||||
std::string grammar; // optional BNF-like grammar to constrain sampling
|
std::string grammar; // optional BNF-like grammar to constrain sampling
|
||||||
|
|
||||||
|
|
|
@ -29,19 +29,25 @@ git clone https://huggingface.co/liuhaotian/llava-v1.5-7b
|
||||||
git clone https://huggingface.co/openai/clip-vit-large-patch14-336
|
git clone https://huggingface.co/openai/clip-vit-large-patch14-336
|
||||||
```
|
```
|
||||||
|
|
||||||
2. Use `llava-surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:
|
2. Install the required Python packages:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
pip install -r examples/llava/requirements.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
3. Use `llava-surgery.py` to split the LLaVA model into LLaMA and multimodal projector constituents:
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
python ./examples/llava/llava-surgery.py -m ../llava-v1.5-7b
|
python ./examples/llava/llava-surgery.py -m ../llava-v1.5-7b
|
||||||
```
|
```
|
||||||
|
|
||||||
3. Use `convert-image-encoder-to-gguf.py` to convert the LLaVA image encoder to GGUF:
|
4. Use `convert-image-encoder-to-gguf.py` to convert the LLaVA image encoder to GGUF:
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
python ./examples/llava/convert-image-encoder-to-gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
|
python ./examples/llava/convert-image-encoder-to-gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
|
||||||
```
|
```
|
||||||
|
|
||||||
4. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:
|
5. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
python ./convert.py ../llava-v1.5-7b
|
python ./convert.py ../llava-v1.5-7b
|
||||||
|
|
|
@ -42,5 +42,5 @@ if len(clip_tensors) > 0:
|
||||||
torch.save(checkpoint, path)
|
torch.save(checkpoint, path)
|
||||||
|
|
||||||
print("Done!")
|
print("Done!")
|
||||||
print(f"Now you can convert {args.model} to a a regular LLaMA GGUF file.")
|
print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.")
|
||||||
print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.")
|
print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.")
|
||||||
|
|
3
examples/llava/requirements.txt
Normal file
3
examples/llava/requirements.txt
Normal file
|
@ -0,0 +1,3 @@
|
||||||
|
-r ../../requirements/requirements-convert.txt
|
||||||
|
pillow~=10.2.0
|
||||||
|
torch~=2.1.1
|
|
@ -1,7 +1,9 @@
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
#include "ggml.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
|
#include <cstdint>
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
@ -73,6 +75,8 @@ int main(int argc, char ** argv){
|
||||||
int n_drafted = 0;
|
int n_drafted = 0;
|
||||||
int n_accept = 0;
|
int n_accept = 0;
|
||||||
|
|
||||||
|
int64_t t_draft_us = 0;
|
||||||
|
|
||||||
int n_past = inp.size();
|
int n_past = inp.size();
|
||||||
|
|
||||||
bool has_eos = false;
|
bool has_eos = false;
|
||||||
|
@ -160,7 +164,7 @@ int main(int argc, char ** argv){
|
||||||
|
|
||||||
// generate n_pred tokens through prompt lookup
|
// generate n_pred tokens through prompt lookup
|
||||||
auto prompt_lookup = [&]() -> void {
|
auto prompt_lookup = [&]() -> void {
|
||||||
int inp_size = inp.size();
|
const int inp_size = inp.size();
|
||||||
for (int ngram_size = ngram_max ; ngram_size > ngram_min; --ngram_size){
|
for (int ngram_size = ngram_max ; ngram_size > ngram_min; --ngram_size){
|
||||||
const llama_token * ngram = &inp[inp_size - ngram_size];
|
const llama_token * ngram = &inp[inp_size - ngram_size];
|
||||||
|
|
||||||
|
@ -191,8 +195,12 @@ int main(int argc, char ** argv){
|
||||||
return;
|
return;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
const int64_t t_start_draft_us = ggml_time_us();
|
||||||
|
|
||||||
prompt_lookup();
|
prompt_lookup();
|
||||||
|
|
||||||
|
t_draft_us += ggml_time_us() - t_start_draft_us;
|
||||||
|
|
||||||
llama_decode(ctx, batch_tgt);
|
llama_decode(ctx, batch_tgt);
|
||||||
++n_past;
|
++n_past;
|
||||||
|
|
||||||
|
@ -210,6 +218,8 @@ int main(int argc, char ** argv){
|
||||||
LOG_TEE("n_draft = %d\n", n_draft);
|
LOG_TEE("n_draft = %d\n", n_draft);
|
||||||
LOG_TEE("n_predict = %d\n", n_predict);
|
LOG_TEE("n_predict = %d\n", n_predict);
|
||||||
LOG_TEE("n_drafted = %d\n", n_drafted);
|
LOG_TEE("n_drafted = %d\n", n_drafted);
|
||||||
|
LOG_TEE("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n",
|
||||||
|
t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us));
|
||||||
LOG_TEE("n_accept = %d\n", n_accept);
|
LOG_TEE("n_accept = %d\n", n_accept);
|
||||||
LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
|
LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
|
||||||
|
|
||||||
|
|
|
@ -98,7 +98,7 @@ static void write_logfile(
|
||||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
|
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
|
||||||
static void sigint_handler(int signo) {
|
static void sigint_handler(int signo) {
|
||||||
if (signo == SIGINT) {
|
if (signo == SIGINT) {
|
||||||
if (!is_interacting) {
|
if (!is_interacting && g_params->interactive) {
|
||||||
is_interacting = true;
|
is_interacting = true;
|
||||||
} else {
|
} else {
|
||||||
console::cleanup();
|
console::cleanup();
|
||||||
|
@ -392,7 +392,8 @@ int main(int argc, char ** argv) {
|
||||||
LOG_TEE("\n");
|
LOG_TEE("\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.interactive) {
|
// ctrl+C handling
|
||||||
|
{
|
||||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
|
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
|
||||||
struct sigaction sigint_action;
|
struct sigaction sigint_action;
|
||||||
sigint_action.sa_handler = sigint_handler;
|
sigint_action.sa_handler = sigint_handler;
|
||||||
|
@ -405,7 +406,9 @@ int main(int argc, char ** argv) {
|
||||||
};
|
};
|
||||||
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
|
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
|
||||||
#endif
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
if (params.interactive) {
|
||||||
LOG_TEE("%s: interactive mode on.\n", __func__);
|
LOG_TEE("%s: interactive mode on.\n", __func__);
|
||||||
|
|
||||||
if (!params.antiprompt.empty()) {
|
if (!params.antiprompt.empty()) {
|
||||||
|
|
|
@ -185,7 +185,7 @@ node index.js
|
||||||
|
|
||||||
`ignore_eos`: Ignore end of stream token and continue generating (default: false).
|
`ignore_eos`: Ignore end of stream token and continue generating (default: false).
|
||||||
|
|
||||||
`logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced (default: []).
|
`logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced. The tokens can also be represented as strings, e.g. `[["Hello, World!",-0.5]]` will reduce the likelihood of all the individual tokens that represent the string `Hello, World!`, just like the `presence_penalty` does. (default: []).
|
||||||
|
|
||||||
`n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token (default: 0)
|
`n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token (default: 0)
|
||||||
|
|
||||||
|
|
|
@ -15,9 +15,13 @@
|
||||||
using json = nlohmann::json;
|
using json = nlohmann::json;
|
||||||
|
|
||||||
inline static json oaicompat_completion_params_parse(
|
inline static json oaicompat_completion_params_parse(
|
||||||
const json &body /* openai api json semantics */)
|
const json &body, /* openai api json semantics */
|
||||||
|
const std::string &chat_template)
|
||||||
{
|
{
|
||||||
json llama_params;
|
json llama_params;
|
||||||
|
std::string formatted_prompt = chat_template == "chatml"
|
||||||
|
? format_chatml(body["messages"]) // OpenAI 'messages' to chatml (with <|im_start|>,...)
|
||||||
|
: format_llama2(body["messages"]); // OpenAI 'messages' to llama2 (with [INST],...)
|
||||||
|
|
||||||
llama_params["__oaicompat"] = true;
|
llama_params["__oaicompat"] = true;
|
||||||
|
|
||||||
|
@ -30,7 +34,7 @@ inline static json oaicompat_completion_params_parse(
|
||||||
// https://platform.openai.com/docs/api-reference/chat/create
|
// https://platform.openai.com/docs/api-reference/chat/create
|
||||||
llama_sampling_params default_sparams;
|
llama_sampling_params default_sparams;
|
||||||
llama_params["model"] = json_value(body, "model", std::string("unknown"));
|
llama_params["model"] = json_value(body, "model", std::string("unknown"));
|
||||||
llama_params["prompt"] = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt'
|
llama_params["prompt"] = formatted_prompt;
|
||||||
llama_params["cache_prompt"] = json_value(body, "cache_prompt", false);
|
llama_params["cache_prompt"] = json_value(body, "cache_prompt", false);
|
||||||
llama_params["temperature"] = json_value(body, "temperature", 0.0);
|
llama_params["temperature"] = json_value(body, "temperature", 0.0);
|
||||||
llama_params["top_k"] = json_value(body, "top_k", default_sparams.top_k);
|
llama_params["top_k"] = json_value(body, "top_k", default_sparams.top_k);
|
||||||
|
|
|
@ -36,6 +36,7 @@ struct server_params
|
||||||
std::string hostname = "127.0.0.1";
|
std::string hostname = "127.0.0.1";
|
||||||
std::vector<std::string> api_keys;
|
std::vector<std::string> api_keys;
|
||||||
std::string public_path = "examples/server/public";
|
std::string public_path = "examples/server/public";
|
||||||
|
std::string chat_template = "chatml";
|
||||||
int32_t port = 8080;
|
int32_t port = 8080;
|
||||||
int32_t read_timeout = 600;
|
int32_t read_timeout = 600;
|
||||||
int32_t write_timeout = 600;
|
int32_t write_timeout = 600;
|
||||||
|
@ -625,18 +626,36 @@ struct llama_server_context
|
||||||
const int n_vocab = llama_n_vocab(model);
|
const int n_vocab = llama_n_vocab(model);
|
||||||
for (const auto &el : *logit_bias)
|
for (const auto &el : *logit_bias)
|
||||||
{
|
{
|
||||||
if (el.is_array() && el.size() == 2 && el[0].is_number_integer())
|
if (el.is_array() && el.size() == 2)
|
||||||
|
{
|
||||||
|
float bias;
|
||||||
|
if (el[1].is_number())
|
||||||
|
{
|
||||||
|
bias = el[1].get<float>();
|
||||||
|
}
|
||||||
|
else if (el[1].is_boolean() && !el[1].get<bool>())
|
||||||
|
{
|
||||||
|
bias = -INFINITY;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (el[0].is_number_integer())
|
||||||
{
|
{
|
||||||
llama_token tok = el[0].get<llama_token>();
|
llama_token tok = el[0].get<llama_token>();
|
||||||
if (tok >= 0 && tok < n_vocab)
|
if (tok >= 0 && tok < n_vocab)
|
||||||
{
|
{
|
||||||
if (el[1].is_number())
|
slot->sparams.logit_bias[tok] = bias;
|
||||||
{
|
|
||||||
slot->sparams.logit_bias[tok] = el[1].get<float>();
|
|
||||||
}
|
}
|
||||||
else if (el[1].is_boolean() && !el[1].get<bool>())
|
}
|
||||||
|
else if (el[0].is_string())
|
||||||
{
|
{
|
||||||
slot->sparams.logit_bias[tok] = -INFINITY;
|
auto toks = llama_tokenize(model, el[0].get<std::string>(), false);
|
||||||
|
for (auto tok : toks)
|
||||||
|
{
|
||||||
|
slot->sparams.logit_bias[tok] = bias;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1592,10 +1611,6 @@ struct llama_server_context
|
||||||
LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed);
|
LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed);
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past);
|
|
||||||
|
|
||||||
llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1);
|
|
||||||
|
|
||||||
slot.cache_tokens = prompt_tokens;
|
slot.cache_tokens = prompt_tokens;
|
||||||
|
|
||||||
if (slot.n_past == slot.num_prompt_tokens && slot.n_past > 0)
|
if (slot.n_past == slot.num_prompt_tokens && slot.n_past > 0)
|
||||||
|
@ -1609,6 +1624,10 @@ struct llama_server_context
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past);
|
||||||
|
|
||||||
|
llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1);
|
||||||
|
|
||||||
LOG_VERBOSE("prompt ingested", {
|
LOG_VERBOSE("prompt ingested", {
|
||||||
{"n_past", slot.n_past},
|
{"n_past", slot.n_past},
|
||||||
{"cached", tokens_to_str(ctx, slot.cache_tokens.cbegin(), slot.cache_tokens.cbegin() + slot.n_past)},
|
{"cached", tokens_to_str(ctx, slot.cache_tokens.cbegin(), slot.cache_tokens.cbegin() + slot.n_past)},
|
||||||
|
@ -1859,6 +1878,8 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms,
|
||||||
printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
|
printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
|
||||||
printf(" -gan N, --grp-attn-n N set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`");
|
printf(" -gan N, --grp-attn-n N set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`");
|
||||||
printf(" -gaw N, --grp-attn-w N set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`");
|
printf(" -gaw N, --grp-attn-w N set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`");
|
||||||
|
printf(" --chat-template FORMAT_NAME");
|
||||||
|
printf(" set chat template, possible valus is: llama2, chatml (default %s)", sparams.chat_template.c_str());
|
||||||
printf("\n");
|
printf("\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2290,6 +2311,21 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||||
log_set_target(stdout);
|
log_set_target(stdout);
|
||||||
LOG_INFO("logging to file is disabled.", {});
|
LOG_INFO("logging to file is disabled.", {});
|
||||||
}
|
}
|
||||||
|
else if (arg == "--chat-template")
|
||||||
|
{
|
||||||
|
if (++i >= argc)
|
||||||
|
{
|
||||||
|
invalid_param = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
std::string value(argv[i]);
|
||||||
|
if (value != "chatml" && value != "llama2") {
|
||||||
|
fprintf(stderr, "error: chat template can be \"llama2\" or \"chatml\", but got: %s\n", value.c_str());
|
||||||
|
invalid_param = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
sparams.chat_template = value;
|
||||||
|
}
|
||||||
else if (arg == "--override-kv")
|
else if (arg == "--override-kv")
|
||||||
{
|
{
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
|
@ -2743,13 +2779,13 @@ int main(int argc, char **argv)
|
||||||
|
|
||||||
|
|
||||||
// TODO: add mount point without "/v1" prefix -- how?
|
// TODO: add mount point without "/v1" prefix -- how?
|
||||||
svr.Post("/v1/chat/completions", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res)
|
svr.Post("/v1/chat/completions", [&llama, &validate_api_key, &sparams](const httplib::Request &req, httplib::Response &res)
|
||||||
{
|
{
|
||||||
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
|
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
|
||||||
if (!validate_api_key(req, res)) {
|
if (!validate_api_key(req, res)) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
json data = oaicompat_completion_params_parse(json::parse(req.body));
|
json data = oaicompat_completion_params_parse(json::parse(req.body), sparams.chat_template);
|
||||||
|
|
||||||
const int task_id = llama.queue_tasks.get_new_id();
|
const int task_id = llama.queue_tasks.get_new_id();
|
||||||
llama.queue_results.add_waiting_task_id(task_id);
|
llama.queue_results.add_waiting_task_id(task_id);
|
||||||
|
|
|
@ -167,6 +167,34 @@ static T json_value(const json &body, const std::string &key, const T &default_v
|
||||||
: default_value;
|
: default_value;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
inline std::string format_llama2(std::vector<json> messages)
|
||||||
|
{
|
||||||
|
std::ostringstream output;
|
||||||
|
bool is_inside_turn = false;
|
||||||
|
|
||||||
|
for (auto it = messages.begin(); it != messages.end(); ++it) {
|
||||||
|
if (!is_inside_turn) {
|
||||||
|
output << "[INST] ";
|
||||||
|
}
|
||||||
|
std::string role = json_value(*it, "role", std::string("user"));
|
||||||
|
std::string content = json_value(*it, "content", std::string(""));
|
||||||
|
if (role == "system") {
|
||||||
|
output << "<<SYS>>\n" << content << "\n<<SYS>>\n\n";
|
||||||
|
is_inside_turn = true;
|
||||||
|
} else if (role == "user") {
|
||||||
|
output << content << " [/INST]";
|
||||||
|
is_inside_turn = true;
|
||||||
|
} else {
|
||||||
|
output << " " << content << " </s>";
|
||||||
|
is_inside_turn = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
LOG_VERBOSE("format_llama2", {{"text", output.str()}});
|
||||||
|
|
||||||
|
return output.str();
|
||||||
|
}
|
||||||
|
|
||||||
inline std::string format_chatml(std::vector<json> messages)
|
inline std::string format_chatml(std::vector<json> messages)
|
||||||
{
|
{
|
||||||
std::ostringstream chatml_msgs;
|
std::ostringstream chatml_msgs;
|
||||||
|
@ -180,6 +208,8 @@ inline std::string format_chatml(std::vector<json> messages)
|
||||||
|
|
||||||
chatml_msgs << "<|im_start|>assistant" << '\n';
|
chatml_msgs << "<|im_start|>assistant" << '\n';
|
||||||
|
|
||||||
|
LOG_VERBOSE("format_chatml", {{"text", chatml_msgs.str()}});
|
||||||
|
|
||||||
return chatml_msgs.str();
|
return chatml_msgs.str();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -653,6 +653,9 @@ struct ggml_backend_cpu_context {
|
||||||
int n_threads;
|
int n_threads;
|
||||||
void * work_data;
|
void * work_data;
|
||||||
size_t work_size;
|
size_t work_size;
|
||||||
|
|
||||||
|
ggml_abort_callback abort_callback;
|
||||||
|
void * abort_callback_data;
|
||||||
};
|
};
|
||||||
|
|
||||||
GGML_CALL static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
|
GGML_CALL static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
|
||||||
|
@ -691,6 +694,9 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg
|
||||||
cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
|
cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
|
||||||
|
cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
|
||||||
|
|
||||||
return cpu_plan;
|
return cpu_plan;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -721,9 +727,11 @@ GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, str
|
||||||
cpu_ctx->work_data = realloc(cpu_ctx->work_data, cplan.work_size);
|
cpu_ctx->work_data = realloc(cpu_ctx->work_data, cplan.work_size);
|
||||||
cpu_ctx->work_size = cplan.work_size;
|
cpu_ctx->work_size = cplan.work_size;
|
||||||
}
|
}
|
||||||
|
|
||||||
cplan.work_data = cpu_ctx->work_data;
|
cplan.work_data = cpu_ctx->work_data;
|
||||||
|
|
||||||
|
cplan.abort_callback = cpu_ctx->abort_callback;
|
||||||
|
cplan.abort_callback_data = cpu_ctx->abort_callback_data;
|
||||||
|
|
||||||
ggml_graph_compute(cgraph, &cplan);
|
ggml_graph_compute(cgraph, &cplan);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
@ -762,6 +770,8 @@ ggml_backend_t ggml_backend_cpu_init(void) {
|
||||||
ctx->n_threads = GGML_DEFAULT_N_THREADS;
|
ctx->n_threads = GGML_DEFAULT_N_THREADS;
|
||||||
ctx->work_data = NULL;
|
ctx->work_data = NULL;
|
||||||
ctx->work_size = 0;
|
ctx->work_size = 0;
|
||||||
|
ctx->abort_callback = NULL;
|
||||||
|
ctx->abort_callback_data = NULL;
|
||||||
|
|
||||||
ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend));
|
ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend));
|
||||||
|
|
||||||
|
@ -783,6 +793,14 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
|
||||||
ctx->n_threads = n_threads;
|
ctx->n_threads = n_threads;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Store an abort callback and its opaque user pointer in the context of a CPU backend.
// Asserts that backend_cpu really is a CPU backend before touching its context.
void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));

    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *) backend_cpu->context;

    cpu_ctx->abort_callback      = abort_callback;
    cpu_ctx->abort_callback_data = abort_callback_data;
}
|
||||||
|
|
||||||
GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
|
GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
|
||||||
return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size);
|
return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size);
|
||||||
}
|
}
|
||||||
|
|
|
@ -85,6 +85,7 @@ extern "C" {
|
||||||
|
|
||||||
GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend);
|
GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend);
|
||||||
GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
|
GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
|
||||||
|
// Set the abort callback and its user data on a CPU backend (backend_cpu must be a CPU backend).
// The pair is kept in the backend context and copied into the compute plan before graph compute.
// NOTE(review): the callback's return-value semantics are defined by ggml_abort_callback - confirm in ggml.h
GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
|
||||||
|
|
||||||
// Create a backend buffer from an existing pointer
|
// Create a backend buffer from an existing pointer
|
||||||
GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
|
GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
|
||||||
|
|
|
@ -687,6 +687,7 @@ static bool ggml_metal_graph_compute(
|
||||||
struct ggml_metal_context * ctx,
|
struct ggml_metal_context * ctx,
|
||||||
struct ggml_cgraph * gf) {
|
struct ggml_cgraph * gf) {
|
||||||
|
|
||||||
|
@autoreleasepool {
|
||||||
MTLComputePassDescriptor * edesc = MTLComputePassDescriptor.computePassDescriptor;
|
MTLComputePassDescriptor * edesc = MTLComputePassDescriptor.computePassDescriptor;
|
||||||
edesc.dispatchType = MTLDispatchTypeSerial;
|
edesc.dispatchType = MTLDispatchTypeSerial;
|
||||||
|
|
||||||
|
@ -2272,6 +2273,7 @@ static bool ggml_metal_graph_compute(
|
||||||
[[MTLCaptureManager sharedCaptureManager] stopCapture];
|
[[MTLCaptureManager sharedCaptureManager] stopCapture];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
351
ggml-quants.c
351
ggml-quants.c
|
@ -49,6 +49,8 @@
|
||||||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||||
|
|
||||||
|
#define UNUSED GGML_UNUSED
|
||||||
|
|
||||||
#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
|
#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
|
||||||
|
|
||||||
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
|
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
|
||||||
|
@ -268,6 +270,17 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
|
||||||
#endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
|
#endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
|
||||||
|
|
||||||
#if defined(__ARM_NEON)
|
#if defined(__ARM_NEON)
|
||||||
|
|
||||||
|
#ifdef _MSC_VER
|
||||||
|
|
||||||
|
#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
#if !defined(__aarch64__)
|
#if !defined(__aarch64__)
|
||||||
|
|
||||||
// 64-bit compatibility
|
// 64-bit compatibility
|
||||||
|
@ -3666,15 +3679,92 @@ static inline __m128i get_scale_shuffle(int i) {
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||||
const int qk = QK8_0;
|
const int qk = QK8_0;
|
||||||
const int nb = n / qk;
|
const int nb = n / qk;
|
||||||
|
|
||||||
assert(n % qk == 0);
|
assert(n % qk == 0);
|
||||||
|
#if defined(__ARM_FEATURE_MATMUL_INT8)
|
||||||
|
assert((nrc == 2) || (nrc == 1));
|
||||||
|
#else
|
||||||
|
assert(nrc == 1);
|
||||||
|
#endif
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
const block_q4_0 * restrict x = vx;
|
const block_q4_0 * restrict x = vx;
|
||||||
const block_q8_0 * restrict y = vy;
|
const block_q8_0 * restrict y = vy;
|
||||||
|
|
||||||
|
#if defined(__ARM_FEATURE_MATMUL_INT8)
|
||||||
|
if (nrc == 2) {
|
||||||
|
const block_q4_0 * restrict vx0 = vx;
|
||||||
|
const block_q4_0 * restrict vx1 = vx + bx;
|
||||||
|
|
||||||
|
const block_q8_0 * restrict vy0 = vy;
|
||||||
|
const block_q8_0 * restrict vy1 = vy + by;
|
||||||
|
|
||||||
|
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
||||||
|
|
||||||
|
for (int i = 0; i < nb; i++) {
|
||||||
|
const block_q4_0 * restrict b_x0 = &vx0[i];
|
||||||
|
const block_q4_0 * restrict b_x1 = &vx1[i];
|
||||||
|
const block_q8_0 * restrict b_y0 = &vy0[i];
|
||||||
|
const block_q8_0 * restrict b_y1 = &vy1[i];
|
||||||
|
|
||||||
|
const uint8x16_t m4b = vdupq_n_u8(0x0F);
|
||||||
|
const int8x16_t s8b = vdupq_n_s8(0x8);
|
||||||
|
|
||||||
|
const uint8x16_t v0_0 = vld1q_u8(b_x0->qs);
|
||||||
|
const uint8x16_t v0_1 = vld1q_u8(b_x1->qs);
|
||||||
|
|
||||||
|
// 4-bit -> 8-bit
|
||||||
|
const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b));
|
||||||
|
const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
|
||||||
|
const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b));
|
||||||
|
const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
|
||||||
|
|
||||||
|
// sub 8
|
||||||
|
const int8x16_t x0_l = vsubq_s8(v0_0l, s8b);
|
||||||
|
const int8x16_t x0_h = vsubq_s8(v0_0h, s8b);
|
||||||
|
const int8x16_t x1_l = vsubq_s8(v0_1l, s8b);
|
||||||
|
const int8x16_t x1_h = vsubq_s8(v0_1h, s8b);
|
||||||
|
|
||||||
|
// load y
|
||||||
|
const int8x16_t y0_l = vld1q_s8(b_y0->qs);
|
||||||
|
const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16);
|
||||||
|
const int8x16_t y1_l = vld1q_s8(b_y1->qs);
|
||||||
|
const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
|
||||||
|
|
||||||
|
float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
|
||||||
|
GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
|
||||||
|
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
|
||||||
|
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
|
||||||
|
|
||||||
|
int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
|
||||||
|
int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
|
||||||
|
|
||||||
|
int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
|
||||||
|
int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
|
||||||
|
|
||||||
|
int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
|
||||||
|
int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
|
||||||
|
|
||||||
|
int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
|
||||||
|
int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
|
||||||
|
|
||||||
|
sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)),
|
||||||
|
l1, r1)), l2, r2)), l3, r3))), scale);
|
||||||
|
}
|
||||||
|
float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2);
|
||||||
|
float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
|
||||||
|
|
||||||
|
vst1_f32(s, vget_low_f32(sumv2));
|
||||||
|
vst1_f32(s + bs, vget_high_f32(sumv2));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
#if defined(__ARM_NEON)
|
#if defined(__ARM_NEON)
|
||||||
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
||||||
float32x4_t sumv1 = vdupq_n_f32(0.0f);
|
float32x4_t sumv1 = vdupq_n_f32(0.0f);
|
||||||
|
@ -3956,15 +4046,93 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx,
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||||
const int qk = QK8_1;
|
const int qk = QK8_1;
|
||||||
const int nb = n / qk;
|
const int nb = n / qk;
|
||||||
|
|
||||||
assert(n % qk == 0);
|
assert(n % qk == 0);
|
||||||
|
#if defined(__ARM_FEATURE_MATMUL_INT8)
|
||||||
|
assert((nrc == 2) || (nrc == 1));
|
||||||
|
#else
|
||||||
|
assert(nrc == 1);
|
||||||
|
#endif
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
const block_q4_1 * restrict x = vx;
|
const block_q4_1 * restrict x = vx;
|
||||||
const block_q8_1 * restrict y = vy;
|
const block_q8_1 * restrict y = vy;
|
||||||
|
|
||||||
|
#if defined(__ARM_FEATURE_MATMUL_INT8)
|
||||||
|
if (nrc == 2) {
|
||||||
|
const block_q4_1 * restrict vx0 = vx;
|
||||||
|
const block_q4_1 * restrict vx1 = vx + bx;
|
||||||
|
const block_q8_1 * restrict vy0 = vy;
|
||||||
|
const block_q8_1 * restrict vy1 = vy + by;
|
||||||
|
|
||||||
|
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
||||||
|
float32x4_t summs0 = vdupq_n_f32(0.0f);
|
||||||
|
|
||||||
|
for (int i = 0; i < nb; i++) {
|
||||||
|
const block_q4_1 * restrict b_x0 = &vx0[i];
|
||||||
|
const block_q4_1 * restrict b_x1 = &vx1[i];
|
||||||
|
const block_q8_1 * restrict b_y0 = &vy0[i];
|
||||||
|
const block_q8_1 * restrict b_y1 = &vy1[i];
|
||||||
|
|
||||||
|
float32x4_t summs_t = {GGML_FP16_TO_FP32(b_x0->m) * b_y0->s,
|
||||||
|
GGML_FP16_TO_FP32(b_x1->m) * b_y0->s,
|
||||||
|
GGML_FP16_TO_FP32(b_x0->m) * b_y1->s,
|
||||||
|
GGML_FP16_TO_FP32(b_x1->m) * b_y1->s};
|
||||||
|
summs0 += summs_t;
|
||||||
|
|
||||||
|
const uint8x16_t m4b = vdupq_n_u8(0x0F);
|
||||||
|
|
||||||
|
const uint8x16_t v0_0 = vld1q_u8(b_x0->qs);
|
||||||
|
const uint8x16_t v0_1 = vld1q_u8(b_x1->qs);
|
||||||
|
|
||||||
|
// 4-bit -> 8-bit
|
||||||
|
const int8x16_t x0_l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b));
|
||||||
|
const int8x16_t x0_h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
|
||||||
|
const int8x16_t x1_l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b));
|
||||||
|
const int8x16_t x1_h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
|
||||||
|
|
||||||
|
// load y
|
||||||
|
const int8x16_t y0_l = vld1q_s8(b_y0->qs);
|
||||||
|
const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16);
|
||||||
|
const int8x16_t y1_l = vld1q_s8(b_y1->qs);
|
||||||
|
const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
|
||||||
|
|
||||||
|
// mmla into int32x4_t
|
||||||
|
float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
|
||||||
|
GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
|
||||||
|
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
|
||||||
|
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
|
||||||
|
|
||||||
|
int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
|
||||||
|
int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
|
||||||
|
|
||||||
|
int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
|
||||||
|
int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
|
||||||
|
|
||||||
|
int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
|
||||||
|
int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
|
||||||
|
|
||||||
|
int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
|
||||||
|
int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
|
||||||
|
sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)),
|
||||||
|
l1, r1)), l2, r2)), l3, r3))), scale);
|
||||||
|
}
|
||||||
|
|
||||||
|
float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2);
|
||||||
|
float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
|
||||||
|
sumv2 = sumv2 + summs0;
|
||||||
|
|
||||||
|
vst1_f32(s, vget_low_f32(sumv2));
|
||||||
|
vst1_f32(s + bs, vget_high_f32(sumv2));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
// TODO: add WASM SIMD
|
// TODO: add WASM SIMD
|
||||||
#if defined(__ARM_NEON)
|
#if defined(__ARM_NEON)
|
||||||
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
||||||
|
@ -4096,12 +4264,17 @@ void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restri
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||||
const int qk = QK8_0;
|
const int qk = QK8_0;
|
||||||
const int nb = n / qk;
|
const int nb = n / qk;
|
||||||
|
|
||||||
assert(n % qk == 0);
|
assert(n % qk == 0);
|
||||||
assert(qk == QK5_0);
|
assert(qk == QK5_0);
|
||||||
|
assert(nrc == 1);
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
const block_q5_0 * restrict x = vx;
|
const block_q5_0 * restrict x = vx;
|
||||||
const block_q8_0 * restrict y = vy;
|
const block_q8_0 * restrict y = vy;
|
||||||
|
@ -4382,12 +4555,17 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||||
const int qk = QK8_1;
|
const int qk = QK8_1;
|
||||||
const int nb = n / qk;
|
const int nb = n / qk;
|
||||||
|
|
||||||
assert(n % qk == 0);
|
assert(n % qk == 0);
|
||||||
assert(qk == QK5_1);
|
assert(qk == QK5_1);
|
||||||
|
assert(nrc == 1);
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
const block_q5_1 * restrict x = vx;
|
const block_q5_1 * restrict x = vx;
|
||||||
const block_q8_1 * restrict y = vy;
|
const block_q8_1 * restrict y = vy;
|
||||||
|
@ -4681,15 +4859,79 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||||
const int qk = QK8_0;
|
const int qk = QK8_0;
|
||||||
const int nb = n / qk;
|
const int nb = n / qk;
|
||||||
|
|
||||||
assert(n % qk == 0);
|
assert(n % qk == 0);
|
||||||
|
#if defined(__ARM_FEATURE_MATMUL_INT8)
|
||||||
|
assert((nrc == 2) || (nrc == 1));
|
||||||
|
#else
|
||||||
|
assert(nrc == 1);
|
||||||
|
#endif
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
const block_q8_0 * restrict x = vx;
|
const block_q8_0 * restrict x = vx;
|
||||||
const block_q8_0 * restrict y = vy;
|
const block_q8_0 * restrict y = vy;
|
||||||
|
|
||||||
|
#if defined(__ARM_FEATURE_MATMUL_INT8)
|
||||||
|
if (nrc == 2) {
|
||||||
|
const block_q8_0 * restrict vx0 = vx;
|
||||||
|
const block_q8_0 * restrict vx1 = vx + bx;
|
||||||
|
const block_q8_0 * restrict vy0 = vy;
|
||||||
|
const block_q8_0 * restrict vy1 = vy + by;
|
||||||
|
|
||||||
|
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
||||||
|
|
||||||
|
for (int i = 0; i < nb; i++) {
|
||||||
|
const block_q8_0 * restrict b_x0 = &vx0[i];
|
||||||
|
const block_q8_0 * restrict b_y0 = &vy0[i];
|
||||||
|
|
||||||
|
const block_q8_0 * restrict b_x1 = &vx1[i];
|
||||||
|
const block_q8_0 * restrict b_y1 = &vy1[i];
|
||||||
|
|
||||||
|
const int8x16_t x0_l = vld1q_s8(b_x0->qs);
|
||||||
|
const int8x16_t x0_h = vld1q_s8(b_x0->qs + 16);
|
||||||
|
const int8x16_t x1_l = vld1q_s8(b_x1->qs);
|
||||||
|
const int8x16_t x1_h = vld1q_s8(b_x1->qs + 16);
|
||||||
|
|
||||||
|
// load y
|
||||||
|
const int8x16_t y0_l = vld1q_s8(b_y0->qs);
|
||||||
|
const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16);
|
||||||
|
const int8x16_t y1_l = vld1q_s8(b_y1->qs);
|
||||||
|
const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
|
||||||
|
|
||||||
|
float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
|
||||||
|
GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
|
||||||
|
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
|
||||||
|
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
|
||||||
|
|
||||||
|
int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
|
||||||
|
int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
|
||||||
|
|
||||||
|
int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
|
||||||
|
int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
|
||||||
|
|
||||||
|
int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
|
||||||
|
int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
|
||||||
|
|
||||||
|
int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
|
||||||
|
int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
|
||||||
|
|
||||||
|
sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)),
|
||||||
|
l1, r1)), l2, r2)), l3, r3))), scale);
|
||||||
|
}
|
||||||
|
float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2);
|
||||||
|
float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
|
||||||
|
|
||||||
|
vst1_f32(s, vget_low_f32(sumv2));
|
||||||
|
vst1_f32(s + bs, vget_high_f32(sumv2));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
#if defined(__ARM_NEON)
|
#if defined(__ARM_NEON)
|
||||||
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
||||||
float32x4_t sumv1 = vdupq_n_f32(0.0f);
|
float32x4_t sumv1 = vdupq_n_f32(0.0f);
|
||||||
|
@ -4784,7 +5026,12 @@ void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restri
|
||||||
}
|
}
|
||||||
|
|
||||||
#if QK_K == 256
|
#if QK_K == 256
|
||||||
void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||||
|
assert(nrc == 1);
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
const block_q2_K * restrict x = vx;
|
const block_q2_K * restrict x = vx;
|
||||||
const block_q8_K * restrict y = vy;
|
const block_q8_K * restrict y = vy;
|
||||||
|
@ -5160,7 +5407,12 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||||
|
assert(nrc == 1);
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
const block_q2_K * restrict x = vx;
|
const block_q2_K * restrict x = vx;
|
||||||
const block_q8_K * restrict y = vy;
|
const block_q8_K * restrict y = vy;
|
||||||
|
@ -5418,8 +5670,13 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if QK_K == 256
|
#if QK_K == 256
|
||||||
void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||||
assert(n % QK_K == 0);
|
assert(n % QK_K == 0);
|
||||||
|
assert(nrc == 1);
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
const uint32_t kmask1 = 0x03030303;
|
const uint32_t kmask1 = 0x03030303;
|
||||||
const uint32_t kmask2 = 0x0f0f0f0f;
|
const uint32_t kmask2 = 0x0f0f0f0f;
|
||||||
|
@ -5938,8 +6195,13 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||||
assert(n % QK_K == 0);
|
assert(n % QK_K == 0);
|
||||||
|
assert(nrc == 1);
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
const block_q3_K * restrict x = vx;
|
const block_q3_K * restrict x = vx;
|
||||||
const block_q8_K * restrict y = vy;
|
const block_q8_K * restrict y = vy;
|
||||||
|
@ -6281,8 +6543,13 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if QK_K == 256
|
#if QK_K == 256
|
||||||
void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||||
assert(n % QK_K == 0);
|
assert(n % QK_K == 0);
|
||||||
|
assert(nrc == 1);
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
const block_q4_K * restrict x = vx;
|
const block_q4_K * restrict x = vx;
|
||||||
const block_q8_K * restrict y = vy;
|
const block_q8_K * restrict y = vy;
|
||||||
|
@ -6637,8 +6904,13 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||||
assert(n % QK_K == 0);
|
assert(n % QK_K == 0);
|
||||||
|
assert(nrc == 1);
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
const block_q4_K * restrict x = vx;
|
const block_q4_K * restrict x = vx;
|
||||||
const block_q8_K * restrict y = vy;
|
const block_q8_K * restrict y = vy;
|
||||||
|
@ -6880,8 +7152,13 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if QK_K == 256
|
#if QK_K == 256
|
||||||
void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||||
assert(n % QK_K == 0);
|
assert(n % QK_K == 0);
|
||||||
|
assert(nrc == 1);
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
const block_q5_K * restrict x = vx;
|
const block_q5_K * restrict x = vx;
|
||||||
const block_q8_K * restrict y = vy;
|
const block_q8_K * restrict y = vy;
|
||||||
|
@ -7300,8 +7577,13 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||||
assert(n % QK_K == 0);
|
assert(n % QK_K == 0);
|
||||||
|
assert(nrc == 1);
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
const block_q5_K * restrict x = vx;
|
const block_q5_K * restrict x = vx;
|
||||||
const block_q8_K * restrict y = vy;
|
const block_q8_K * restrict y = vy;
|
||||||
|
@ -7566,8 +7848,13 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
|
||||||
|
|
||||||
|
|
||||||
#if QK_K == 256
|
#if QK_K == 256
|
||||||
void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||||
assert(n % QK_K == 0);
|
assert(n % QK_K == 0);
|
||||||
|
assert(nrc == 1);
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
const block_q6_K * restrict x = vx;
|
const block_q6_K * restrict x = vx;
|
||||||
const block_q8_K * restrict y = vy;
|
const block_q8_K * restrict y = vy;
|
||||||
|
@ -7998,8 +8285,13 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||||
assert(n % QK_K == 0);
|
assert(n % QK_K == 0);
|
||||||
|
assert(nrc == 1);
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
const block_q6_K * restrict x = vx;
|
const block_q6_K * restrict x = vx;
|
||||||
const block_q8_K * restrict y = vy;
|
const block_q8_K * restrict y = vy;
|
||||||
|
@ -8328,8 +8620,13 @@ static const int8_t keven_signs_q2xs[1024] = {
|
||||||
1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1,
|
1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||||
};
|
};
|
||||||
|
|
||||||
void ggml_vec_dot_iq2_xxs_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||||
assert(n % QK_K == 0);
|
assert(n % QK_K == 0);
|
||||||
|
assert(nrc == 1);
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
const block_iq2_xxs * restrict x = vx;
|
const block_iq2_xxs * restrict x = vx;
|
||||||
const block_q8_K * restrict y = vy;
|
const block_q8_K * restrict y = vy;
|
||||||
|
@ -8451,8 +8748,13 @@ void ggml_vec_dot_iq2_xxs_q8_K(const int n, float * restrict s, const void * res
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||||
assert(n % QK_K == 0);
|
assert(n % QK_K == 0);
|
||||||
|
assert(nrc == 1);
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
const block_iq2_xs * restrict x = vx;
|
const block_iq2_xs * restrict x = vx;
|
||||||
const block_q8_K * restrict y = vy;
|
const block_q8_K * restrict y = vy;
|
||||||
|
@ -8671,8 +8973,13 @@ void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * rest
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO
|
// TODO
|
||||||
void ggml_vec_dot_iq3_xxs_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||||
assert(n % QK_K == 0);
|
assert(n % QK_K == 0);
|
||||||
|
assert(nrc == 1);
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
const block_iq3_xxs * restrict x = vx;
|
const block_iq3_xxs * restrict x = vx;
|
||||||
const block_q8_K * restrict y = vy;
|
const block_q8_K * restrict y = vy;
|
||||||
|
@ -8698,10 +9005,10 @@ void ggml_vec_dot_iq3_xxs_q8_K(const int n, float * restrict s, const void * res
|
||||||
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
||||||
q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
|
q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
|
||||||
memcpy(aux32, gas, 2*sizeof(uint32_t)); gas += 2*sizeof(uint32_t);
|
memcpy(aux32, gas, 2*sizeof(uint32_t)); gas += 2*sizeof(uint32_t);
|
||||||
const uint32x4_t aux32x4_0 = {iq3xxs_grid[q3[ 0]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 3]]};
|
const uint32x4_t aux32x4_0 = ggml_vld1q_u32(iq3xxs_grid[q3[ 0]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 3]]);
|
||||||
const uint32x4_t aux32x4_1 = {iq3xxs_grid[q3[ 4]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 7]]};
|
const uint32x4_t aux32x4_1 = ggml_vld1q_u32(iq3xxs_grid[q3[ 4]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 7]]);
|
||||||
const uint32x4_t aux32x4_2 = {iq3xxs_grid[q3[ 8]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[11]]};
|
const uint32x4_t aux32x4_2 = ggml_vld1q_u32(iq3xxs_grid[q3[ 8]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[11]]);
|
||||||
const uint32x4_t aux32x4_3 = {iq3xxs_grid[q3[12]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[15]]};
|
const uint32x4_t aux32x4_3 = ggml_vld1q_u32(iq3xxs_grid[q3[12]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[15]]);
|
||||||
q3 += 16;
|
q3 += 16;
|
||||||
q3s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 7) & 127))));
|
q3s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 7) & 127))));
|
||||||
q3s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 21) & 127))));
|
q3s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 21) & 127))));
|
||||||
|
|
|
@ -245,20 +245,20 @@ void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_
|
||||||
void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||||
|
|
||||||
// Dot product
|
// Dot product
|
||||||
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
|
|
||||||
void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
|
|
||||||
//
|
//
|
||||||
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
|
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
|
||||||
|
|
110
ggml-vulkan.cpp
110
ggml-vulkan.cpp
|
@ -27,6 +27,7 @@
|
||||||
#define CEIL_DIV(M, N) (((M) + (N)-1) / (N))
|
#define CEIL_DIV(M, N) (((M) + (N)-1) / (N))
|
||||||
|
|
||||||
#define VK_VENDOR_ID_AMD 0x1002
|
#define VK_VENDOR_ID_AMD 0x1002
|
||||||
|
#define VK_VENDOR_ID_APPLE 0x106b
|
||||||
#define VK_VENDOR_ID_INTEL 0x8086
|
#define VK_VENDOR_ID_INTEL 0x8086
|
||||||
#define VK_VENDOR_ID_NVIDIA 0x10de
|
#define VK_VENDOR_ID_NVIDIA 0x10de
|
||||||
|
|
||||||
|
@ -744,6 +745,8 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz
|
||||||
}
|
}
|
||||||
|
|
||||||
if (memory_type_index >= mem_props.memoryTypeCount) {
|
if (memory_type_index >= mem_props.memoryTypeCount) {
|
||||||
|
ctx->device.lock()->device.destroyBuffer(buf->buffer);
|
||||||
|
buf->size = 0;
|
||||||
throw vk::OutOfDeviceMemoryError("No suitable memory type found");
|
throw vk::OutOfDeviceMemoryError("No suitable memory type found");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2032,18 +2035,100 @@ static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ct
|
||||||
return ctx->pipeline_matmul_f32_aligned_l.align;
|
return ctx->pipeline_matmul_f32_aligned_l.align;
|
||||||
}
|
}
|
||||||
|
|
||||||
static vk_pipeline* ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, int m, int n, bool aligned) {
|
static vk_pipeline* ggml_vk_guess_matmul_pipeline_amd(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, int m, int n, bool aligned) {
|
||||||
#ifdef GGML_VULKAN_DEBUG
|
|
||||||
std::cerr << "ggml_vk_guess_matmul_pipeline(" << bit16_x << ", " << bit16_y << ", " << m << ", " << n << ", " << aligned << ")";
|
|
||||||
#endif
|
|
||||||
if (bit16_x && bit16_y) {
|
if (bit16_x && bit16_y) {
|
||||||
if (ctx->device.lock()->vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) {
|
if (m <= 32 || n <= 32) {
|
||||||
#ifdef GGML_VULKAN_DEBUG
|
#ifdef GGML_VULKAN_DEBUG
|
||||||
std::cerr << " S" << std::endl;
|
std::cerr << " S" << std::endl;
|
||||||
#endif
|
#endif
|
||||||
return aligned ? &ctx->pipeline_matmul_f16_aligned_s : &ctx->pipeline_matmul_f16_s;
|
return aligned ? &ctx->pipeline_matmul_f16_aligned_s : &ctx->pipeline_matmul_f16_s;
|
||||||
}
|
}
|
||||||
if (ctx->device.lock()->subgroup_size == 64 || m <= 64 || n <= 64) {
|
#ifdef GGML_VULKAN_DEBUG
|
||||||
|
std::cerr << " M" << std::endl;
|
||||||
|
#endif
|
||||||
|
return aligned ? &ctx->pipeline_matmul_f16_aligned_m : &ctx->pipeline_matmul_f16_m;
|
||||||
|
}
|
||||||
|
if (bit16_x && !bit16_y) {
|
||||||
|
if (m <= 32 || n <= 32) {
|
||||||
|
#ifdef GGML_VULKAN_DEBUG
|
||||||
|
std::cerr << " S" << std::endl;
|
||||||
|
#endif
|
||||||
|
return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_s : &ctx->pipeline_matmul_f16_f32_s;
|
||||||
|
}
|
||||||
|
#ifdef GGML_VULKAN_DEBUG
|
||||||
|
std::cerr << " M" << std::endl;
|
||||||
|
#endif
|
||||||
|
return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_m : &ctx->pipeline_matmul_f16_f32_m;
|
||||||
|
}
|
||||||
|
if (!bit16_x && bit16_y) {
|
||||||
|
GGML_ASSERT(false);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (m <= 32 || n <= 32) {
|
||||||
|
#ifdef GGML_VULKAN_DEBUG
|
||||||
|
std::cerr << " S" << std::endl;
|
||||||
|
#endif
|
||||||
|
return aligned ? &ctx->pipeline_matmul_f32_aligned_s : &ctx->pipeline_matmul_f32_s;
|
||||||
|
}
|
||||||
|
#ifdef GGML_VULKAN_DEBUG
|
||||||
|
std::cerr << " M" << std::endl;
|
||||||
|
#endif
|
||||||
|
return aligned ? &ctx->pipeline_matmul_f32_aligned_m : &ctx->pipeline_matmul_f32_m;
|
||||||
|
}
|
||||||
|
|
||||||
|
static vk_pipeline* ggml_vk_guess_matmul_pipeline_apple(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, bool aligned) {
|
||||||
|
#ifdef GGML_VULKAN_DEBUG
|
||||||
|
std::cerr << " M" << std::endl;
|
||||||
|
#endif
|
||||||
|
if (bit16_x && bit16_y) {
|
||||||
|
return aligned ? &ctx->pipeline_matmul_f16_aligned_m : &ctx->pipeline_matmul_f16_m;
|
||||||
|
}
|
||||||
|
if (bit16_x && !bit16_y) {
|
||||||
|
return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_m : &ctx->pipeline_matmul_f16_f32_m;
|
||||||
|
}
|
||||||
|
if (!bit16_x && bit16_y) {
|
||||||
|
GGML_ASSERT(false);
|
||||||
|
}
|
||||||
|
return aligned ? &ctx->pipeline_matmul_f32_aligned_m : &ctx->pipeline_matmul_f32_m;
|
||||||
|
}
|
||||||
|
|
||||||
|
static vk_pipeline* ggml_vk_guess_matmul_pipeline_intel(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, bool aligned) {
|
||||||
|
#ifdef GGML_VULKAN_DEBUG
|
||||||
|
std::cerr << " S" << std::endl;
|
||||||
|
#endif
|
||||||
|
if (bit16_x && bit16_y) {
|
||||||
|
return aligned ? &ctx->pipeline_matmul_f16_aligned_s : &ctx->pipeline_matmul_f16_s;
|
||||||
|
}
|
||||||
|
if (bit16_x && !bit16_y) {
|
||||||
|
return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_s : &ctx->pipeline_matmul_f16_f32_s;
|
||||||
|
}
|
||||||
|
if (!bit16_x && bit16_y) {
|
||||||
|
GGML_ASSERT(false);
|
||||||
|
}
|
||||||
|
return aligned ? &ctx->pipeline_matmul_f32_aligned_s : &ctx->pipeline_matmul_f32_s;
|
||||||
|
}
|
||||||
|
|
||||||
|
static vk_pipeline* ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, int m, int n, bool aligned) {
|
||||||
|
#ifdef GGML_VULKAN_DEBUG
|
||||||
|
std::cerr << "ggml_vk_guess_matmul_pipeline(" << bit16_x << ", " << bit16_y << ", " << m << ", " << n << ", " << aligned << ")";
|
||||||
|
#endif
|
||||||
|
switch (ctx->device.lock()->vendor_id) {
|
||||||
|
case VK_VENDOR_ID_AMD:
|
||||||
|
return ggml_vk_guess_matmul_pipeline_amd(ctx, bit16_x, bit16_y, m, n, aligned);
|
||||||
|
case VK_VENDOR_ID_APPLE:
|
||||||
|
return ggml_vk_guess_matmul_pipeline_apple(ctx, bit16_x, bit16_y, aligned);
|
||||||
|
case VK_VENDOR_ID_INTEL:
|
||||||
|
return ggml_vk_guess_matmul_pipeline_intel(ctx, bit16_x, bit16_y, aligned);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (bit16_x && bit16_y) {
|
||||||
|
if (m <= 32 || n <= 32) {
|
||||||
|
#ifdef GGML_VULKAN_DEBUG
|
||||||
|
std::cerr << " S" << std::endl;
|
||||||
|
#endif
|
||||||
|
return aligned ? &ctx->pipeline_matmul_f16_aligned_s : &ctx->pipeline_matmul_f16_s;
|
||||||
|
}
|
||||||
|
if (m <= 64 || n <= 64) {
|
||||||
#ifdef GGML_VULKAN_DEBUG
|
#ifdef GGML_VULKAN_DEBUG
|
||||||
std::cerr << " M" << std::endl;
|
std::cerr << " M" << std::endl;
|
||||||
#endif
|
#endif
|
||||||
|
@ -2055,13 +2140,13 @@ static vk_pipeline* ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx,
|
||||||
return aligned ? &ctx->pipeline_matmul_f16_aligned_l : &ctx->pipeline_matmul_f16_l;
|
return aligned ? &ctx->pipeline_matmul_f16_aligned_l : &ctx->pipeline_matmul_f16_l;
|
||||||
}
|
}
|
||||||
if (bit16_x && !bit16_y) {
|
if (bit16_x && !bit16_y) {
|
||||||
if (ctx->device.lock()->vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) {
|
if (m <= 32 || n <= 32) {
|
||||||
#ifdef GGML_VULKAN_DEBUG
|
#ifdef GGML_VULKAN_DEBUG
|
||||||
std::cerr << " S" << std::endl;
|
std::cerr << " S" << std::endl;
|
||||||
#endif
|
#endif
|
||||||
return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_s : &ctx->pipeline_matmul_f16_f32_s;
|
return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_s : &ctx->pipeline_matmul_f16_f32_s;
|
||||||
}
|
}
|
||||||
if (ctx->device.lock()->subgroup_size == 64 || m <= 64 || n <= 64) {
|
if (m <= 64 || n <= 64) {
|
||||||
#ifdef GGML_VULKAN_DEBUG
|
#ifdef GGML_VULKAN_DEBUG
|
||||||
std::cerr << " M" << std::endl;
|
std::cerr << " M" << std::endl;
|
||||||
#endif
|
#endif
|
||||||
|
@ -2076,13 +2161,13 @@ static vk_pipeline* ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx,
|
||||||
GGML_ASSERT(false);
|
GGML_ASSERT(false);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ctx->device.lock()->vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) {
|
if (m <= 32 || n <= 32) {
|
||||||
#ifdef GGML_VULKAN_DEBUG
|
#ifdef GGML_VULKAN_DEBUG
|
||||||
std::cerr << " S" << std::endl;
|
std::cerr << " S" << std::endl;
|
||||||
#endif
|
#endif
|
||||||
return aligned ? &ctx->pipeline_matmul_f32_aligned_s : &ctx->pipeline_matmul_f32_s;
|
return aligned ? &ctx->pipeline_matmul_f32_aligned_s : &ctx->pipeline_matmul_f32_s;
|
||||||
}
|
}
|
||||||
if (ctx->device.lock()->subgroup_size == 64 || m <= 64 || n <= 64) {
|
if (m <= 64 || n <= 64) {
|
||||||
#ifdef GGML_VULKAN_DEBUG
|
#ifdef GGML_VULKAN_DEBUG
|
||||||
std::cerr << " M" << std::endl;
|
std::cerr << " M" << std::endl;
|
||||||
#endif
|
#endif
|
||||||
|
@ -3875,7 +3960,7 @@ static ggml_tensor * ggml_vk_find_last_use(const ggml_tensor * node, ggml_cgraph
|
||||||
|
|
||||||
static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggml_tensor * node){
|
static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggml_tensor * node){
|
||||||
#ifdef GGML_VULKAN_DEBUG
|
#ifdef GGML_VULKAN_DEBUG
|
||||||
std::cerr << "ggml_ctx->preallocate_buffers_graph(" << node << ")" << std::endl;
|
std::cerr << "ggml_vk_preallocate_buffers_graph(" << node << ")" << std::endl;
|
||||||
#endif
|
#endif
|
||||||
const bool any_on_device = node->backend == GGML_BACKEND_GPU
|
const bool any_on_device = node->backend == GGML_BACKEND_GPU
|
||||||
|| (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_GPU || node->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
|
|| (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_GPU || node->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
|
||||||
|
@ -3994,8 +4079,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
#ifdef GGML_VULKAN_DEBUG
|
#ifdef GGML_VULKAN_DEBUG
|
||||||
std::cerr << "ggml_ctx->preallocate_buffers()" << std::endl;
|
std::cerr << "ggml_vk_preallocate_buffers(qx_size: " << ctx->prealloc_size_qx << " qy_size: " << ctx->prealloc_size_qy << " x_size: " << ctx->prealloc_size_x << " y_size: " << ctx->prealloc_size_y << " split_k_size: " << ctx->prealloc_size_split_k << ")" << std::endl;
|
||||||
std::cerr << "qx_size: " << ctx->prealloc_size_qx << " qy_size: " << ctx->prealloc_size_qy << " x_size: " << ctx->prealloc_size_x << " y_size: " << ctx->prealloc_size_y << " split_k_size: " << ctx->prealloc_size_split_k << std::endl;
|
|
||||||
#endif
|
#endif
|
||||||
#if defined(GGML_VULKAN_RUN_TESTS)
|
#if defined(GGML_VULKAN_RUN_TESTS)
|
||||||
ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
|
ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
|
||||||
|
|
166
ggml.c
166
ggml.c
|
@ -428,8 +428,8 @@ int64_t ggml_cycles_per_ms(void) {
|
||||||
|
|
||||||
static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
|
static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
|
||||||
|
|
||||||
static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y);
|
static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc);
|
||||||
static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y);
|
static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc);
|
||||||
|
|
||||||
static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
[GGML_TYPE_I8] = {
|
[GGML_TYPE_I8] = {
|
||||||
|
@ -457,6 +457,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.is_quantized = false,
|
.is_quantized = false,
|
||||||
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
|
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
|
||||||
.vec_dot_type = GGML_TYPE_F32,
|
.vec_dot_type = GGML_TYPE_F32,
|
||||||
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_F16] = {
|
[GGML_TYPE_F16] = {
|
||||||
.type_name = "f16",
|
.type_name = "f16",
|
||||||
|
@ -468,6 +469,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row,
|
.from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row,
|
||||||
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16,
|
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16,
|
||||||
.vec_dot_type = GGML_TYPE_F16,
|
.vec_dot_type = GGML_TYPE_F16,
|
||||||
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_Q4_0] = {
|
[GGML_TYPE_Q4_0] = {
|
||||||
.type_name = "q4_0",
|
.type_name = "q4_0",
|
||||||
|
@ -479,6 +481,11 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference,
|
.from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference,
|
||||||
.vec_dot = ggml_vec_dot_q4_0_q8_0,
|
.vec_dot = ggml_vec_dot_q4_0_q8_0,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_0,
|
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||||
|
#if defined (__ARM_FEATURE_MATMUL_INT8)
|
||||||
|
.nrows = 2,
|
||||||
|
#else
|
||||||
|
.nrows = 1,
|
||||||
|
#endif
|
||||||
},
|
},
|
||||||
[GGML_TYPE_Q4_1] = {
|
[GGML_TYPE_Q4_1] = {
|
||||||
.type_name = "q4_1",
|
.type_name = "q4_1",
|
||||||
|
@ -490,6 +497,11 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference,
|
.from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference,
|
||||||
.vec_dot = ggml_vec_dot_q4_1_q8_1,
|
.vec_dot = ggml_vec_dot_q4_1_q8_1,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_1,
|
.vec_dot_type = GGML_TYPE_Q8_1,
|
||||||
|
#if defined (__ARM_FEATURE_MATMUL_INT8)
|
||||||
|
.nrows = 2,
|
||||||
|
#else
|
||||||
|
.nrows = 1,
|
||||||
|
#endif
|
||||||
},
|
},
|
||||||
[4] = { // GGML_TYPE_Q4_2
|
[4] = { // GGML_TYPE_Q4_2
|
||||||
.type_name = "DEPRECATED",
|
.type_name = "DEPRECATED",
|
||||||
|
@ -501,6 +513,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = NULL,
|
.from_float_reference = NULL,
|
||||||
.vec_dot = NULL,
|
.vec_dot = NULL,
|
||||||
.vec_dot_type = GGML_TYPE_COUNT,
|
.vec_dot_type = GGML_TYPE_COUNT,
|
||||||
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[5] = { // GGML_TYPE_Q4_3
|
[5] = { // GGML_TYPE_Q4_3
|
||||||
.type_name = "DEPRECATED",
|
.type_name = "DEPRECATED",
|
||||||
|
@ -512,6 +525,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = NULL,
|
.from_float_reference = NULL,
|
||||||
.vec_dot = NULL,
|
.vec_dot = NULL,
|
||||||
.vec_dot_type = GGML_TYPE_COUNT,
|
.vec_dot_type = GGML_TYPE_COUNT,
|
||||||
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_Q5_0] = {
|
[GGML_TYPE_Q5_0] = {
|
||||||
.type_name = "q5_0",
|
.type_name = "q5_0",
|
||||||
|
@ -523,6 +537,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference,
|
.from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference,
|
||||||
.vec_dot = ggml_vec_dot_q5_0_q8_0,
|
.vec_dot = ggml_vec_dot_q5_0_q8_0,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_0,
|
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||||
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_Q5_1] = {
|
[GGML_TYPE_Q5_1] = {
|
||||||
.type_name = "q5_1",
|
.type_name = "q5_1",
|
||||||
|
@ -534,6 +549,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference,
|
.from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference,
|
||||||
.vec_dot = ggml_vec_dot_q5_1_q8_1,
|
.vec_dot = ggml_vec_dot_q5_1_q8_1,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_1,
|
.vec_dot_type = GGML_TYPE_Q8_1,
|
||||||
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_Q8_0] = {
|
[GGML_TYPE_Q8_0] = {
|
||||||
.type_name = "q8_0",
|
.type_name = "q8_0",
|
||||||
|
@ -545,6 +561,11 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference,
|
.from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference,
|
||||||
.vec_dot = ggml_vec_dot_q8_0_q8_0,
|
.vec_dot = ggml_vec_dot_q8_0_q8_0,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_0,
|
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||||
|
#if defined (__ARM_FEATURE_MATMUL_INT8)
|
||||||
|
.nrows = 2,
|
||||||
|
#else
|
||||||
|
.nrows = 1,
|
||||||
|
#endif
|
||||||
},
|
},
|
||||||
[GGML_TYPE_Q8_1] = {
|
[GGML_TYPE_Q8_1] = {
|
||||||
.type_name = "q8_1",
|
.type_name = "q8_1",
|
||||||
|
@ -554,6 +575,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float = quantize_row_q8_1,
|
.from_float = quantize_row_q8_1,
|
||||||
.from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference,
|
.from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_1,
|
.vec_dot_type = GGML_TYPE_Q8_1,
|
||||||
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_Q2_K] = {
|
[GGML_TYPE_Q2_K] = {
|
||||||
.type_name = "q2_K",
|
.type_name = "q2_K",
|
||||||
|
@ -565,6 +587,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = (ggml_from_float_t) quantize_row_q2_K_reference,
|
.from_float_reference = (ggml_from_float_t) quantize_row_q2_K_reference,
|
||||||
.vec_dot = ggml_vec_dot_q2_K_q8_K,
|
.vec_dot = ggml_vec_dot_q2_K_q8_K,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||||
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_Q3_K] = {
|
[GGML_TYPE_Q3_K] = {
|
||||||
.type_name = "q3_K",
|
.type_name = "q3_K",
|
||||||
|
@ -576,6 +599,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = (ggml_from_float_t) quantize_row_q3_K_reference,
|
.from_float_reference = (ggml_from_float_t) quantize_row_q3_K_reference,
|
||||||
.vec_dot = ggml_vec_dot_q3_K_q8_K,
|
.vec_dot = ggml_vec_dot_q3_K_q8_K,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||||
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_Q4_K] = {
|
[GGML_TYPE_Q4_K] = {
|
||||||
.type_name = "q4_K",
|
.type_name = "q4_K",
|
||||||
|
@ -587,6 +611,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = (ggml_from_float_t) quantize_row_q4_K_reference,
|
.from_float_reference = (ggml_from_float_t) quantize_row_q4_K_reference,
|
||||||
.vec_dot = ggml_vec_dot_q4_K_q8_K,
|
.vec_dot = ggml_vec_dot_q4_K_q8_K,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||||
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_Q5_K] = {
|
[GGML_TYPE_Q5_K] = {
|
||||||
.type_name = "q5_K",
|
.type_name = "q5_K",
|
||||||
|
@ -598,6 +623,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = (ggml_from_float_t) quantize_row_q5_K_reference,
|
.from_float_reference = (ggml_from_float_t) quantize_row_q5_K_reference,
|
||||||
.vec_dot = ggml_vec_dot_q5_K_q8_K,
|
.vec_dot = ggml_vec_dot_q5_K_q8_K,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||||
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_Q6_K] = {
|
[GGML_TYPE_Q6_K] = {
|
||||||
.type_name = "q6_K",
|
.type_name = "q6_K",
|
||||||
|
@ -609,6 +635,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference,
|
.from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference,
|
||||||
.vec_dot = ggml_vec_dot_q6_K_q8_K,
|
.vec_dot = ggml_vec_dot_q6_K_q8_K,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||||
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_IQ2_XXS] = {
|
[GGML_TYPE_IQ2_XXS] = {
|
||||||
.type_name = "iq2_xxs",
|
.type_name = "iq2_xxs",
|
||||||
|
@ -620,6 +647,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = NULL,
|
.from_float_reference = NULL,
|
||||||
.vec_dot = ggml_vec_dot_iq2_xxs_q8_K,
|
.vec_dot = ggml_vec_dot_iq2_xxs_q8_K,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||||
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_IQ2_XS] = {
|
[GGML_TYPE_IQ2_XS] = {
|
||||||
.type_name = "iq2_xs",
|
.type_name = "iq2_xs",
|
||||||
|
@ -631,6 +659,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = NULL,
|
.from_float_reference = NULL,
|
||||||
.vec_dot = ggml_vec_dot_iq2_xs_q8_K,
|
.vec_dot = ggml_vec_dot_iq2_xs_q8_K,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||||
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_IQ3_XXS] = {
|
[GGML_TYPE_IQ3_XXS] = {
|
||||||
.type_name = "iq3_xxs",
|
.type_name = "iq3_xxs",
|
||||||
|
@ -642,6 +671,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
.from_float_reference = (ggml_from_float_t)quantize_row_iq3_xxs_reference,
|
.from_float_reference = (ggml_from_float_t)quantize_row_iq3_xxs_reference,
|
||||||
.vec_dot = ggml_vec_dot_iq3_xxs_q8_K,
|
.vec_dot = ggml_vec_dot_iq3_xxs_q8_K,
|
||||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||||
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_Q8_K] = {
|
[GGML_TYPE_Q8_K] = {
|
||||||
.type_name = "q8_K",
|
.type_name = "q8_K",
|
||||||
|
@ -1212,7 +1242,13 @@ inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x)
|
||||||
inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
|
inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
|
||||||
inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; }
|
inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; }
|
||||||
|
|
||||||
static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) {
|
static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc) {
|
||||||
|
assert(nrc == 1);
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
#ifdef GGML_SIMD
|
#ifdef GGML_SIMD
|
||||||
float sumf = 0.0f;
|
float sumf = 0.0f;
|
||||||
const int np = (n & ~(GGML_F32_STEP - 1));
|
const int np = (n & ~(GGML_F32_STEP - 1));
|
||||||
|
@ -1249,7 +1285,13 @@ static void ggml_vec_dot_f32(const int n, float * restrict s, const float * rest
|
||||||
*s = sumf;
|
*s = sumf;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) {
|
static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc) {
|
||||||
|
assert(nrc == 1);
|
||||||
|
UNUSED(nrc);
|
||||||
|
UNUSED(bx);
|
||||||
|
UNUSED(by);
|
||||||
|
UNUSED(bs);
|
||||||
|
|
||||||
ggml_float sumf = 0.0;
|
ggml_float sumf = 0.0;
|
||||||
|
|
||||||
#if defined(GGML_SIMD)
|
#if defined(GGML_SIMD)
|
||||||
|
@ -1455,7 +1497,7 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, x, x); *s = sqrtf(*s); }
|
inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1); *s = sqrtf(*s); }
|
||||||
inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; }
|
inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; }
|
||||||
inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
|
inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
|
||||||
inline static void ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); }
|
inline static void ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); }
|
||||||
|
@ -9992,6 +10034,7 @@ static void ggml_compute_forward_mul_mat(
|
||||||
ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
|
ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
|
||||||
enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
|
enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
|
||||||
ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
|
ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
|
||||||
|
int64_t const vec_dot_num_rows = type_traits[type].nrows;
|
||||||
|
|
||||||
GGML_ASSERT(ne0 == ne01);
|
GGML_ASSERT(ne0 == ne01);
|
||||||
GGML_ASSERT(ne1 == ne11);
|
GGML_ASSERT(ne1 == ne11);
|
||||||
|
@ -10159,12 +10202,23 @@ static void ggml_compute_forward_mul_mat(
|
||||||
const int64_t blck_0 = 16;
|
const int64_t blck_0 = 16;
|
||||||
const int64_t blck_1 = 16;
|
const int64_t blck_1 = 16;
|
||||||
|
|
||||||
|
// dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols
|
||||||
|
int64_t nrc = vec_dot_num_rows;
|
||||||
|
// TODO: currently the mmla kernels support only even numbered rows/cols.
|
||||||
|
// this check can be removed once they are extended to support odd numbered rows/cols too
|
||||||
|
if ((nr0 % 2 != 0) || (ne11 % 2 != 0)) {
|
||||||
|
nrc = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
const size_t src1_col_stride = src1_cont || src1->type != vec_dot_type ? row_size : nb11;
|
||||||
|
|
||||||
// attempt to reduce false-sharing (does not seem to make a difference)
|
// attempt to reduce false-sharing (does not seem to make a difference)
|
||||||
float tmp[16];
|
// 16 * 2, accounting for mmla kernels
|
||||||
|
float tmp[32];
|
||||||
|
|
||||||
for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
|
for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
|
||||||
for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
|
for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
|
||||||
for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
|
for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ir1 += nrc) {
|
||||||
const int64_t i13 = (ir1/(ne12*ne1));
|
const int64_t i13 = (ir1/(ne12*ne1));
|
||||||
const int64_t i12 = (ir1 - i13*ne12*ne1)/ne1;
|
const int64_t i12 = (ir1 - i13*ne12*ne1)/ne1;
|
||||||
const int64_t i11 = (ir1 - i13*ne12*ne1 - i12*ne1);
|
const int64_t i11 = (ir1 - i13*ne12*ne1 - i12*ne1);
|
||||||
|
@ -10187,17 +10241,19 @@ static void ggml_compute_forward_mul_mat(
|
||||||
(src1_cont || src1->type != vec_dot_type
|
(src1_cont || src1->type != vec_dot_type
|
||||||
? (i11 + i12*ne11 + i13*ne12*ne11)*row_size
|
? (i11 + i12*ne11 + i13*ne12*ne11)*row_size
|
||||||
: (i11*nb11 + i12*nb12 + i13*nb13));
|
: (i11*nb11 + i12*nb12 + i13*nb13));
|
||||||
|
|
||||||
float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
|
float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
|
||||||
|
|
||||||
//for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
|
//for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
|
||||||
// vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
|
// vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
|
||||||
//}
|
//}
|
||||||
|
|
||||||
for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
|
for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ir0 += nrc) {
|
||||||
vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col);
|
vec_dot(ne00, &tmp[ir0 - iir0], (nrc>1 ? 16 : 0), src0_row + ir0*nb01, (nrc>1 ? nb01 : 0), src1_col, (nrc>1 ? src1_col_stride : 0), nrc);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int cn = 0; cn < nrc; ++cn) {
|
||||||
|
memcpy(&dst_col[iir0 + cn*nb1/nb0], tmp + (cn*16), (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
|
||||||
}
|
}
|
||||||
memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -10386,7 +10442,7 @@ static void ggml_compute_forward_mul_mat_id(
|
||||||
//}
|
//}
|
||||||
|
|
||||||
for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
|
for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
|
||||||
vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col);
|
vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_row + ir0*nb01, 0, src1_col, 0, 1);
|
||||||
}
|
}
|
||||||
memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
|
memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
|
||||||
}
|
}
|
||||||
|
@ -11568,7 +11624,7 @@ static void ggml_compute_forward_soft_max_back_f32(
|
||||||
|
|
||||||
// linear runtime, no additional memory
|
// linear runtime, no additional memory
|
||||||
float dot_y_dy = 0;
|
float dot_y_dy = 0;
|
||||||
ggml_vec_dot_f32 (nc, &dot_y_dy, y, dy);
|
ggml_vec_dot_f32 (nc, &dot_y_dy, 0, y, 0, dy, 0, 1);
|
||||||
ggml_vec_cpy_f32 (nc, dx, dy);
|
ggml_vec_cpy_f32 (nc, dx, dy);
|
||||||
ggml_vec_acc1_f32(nc, dx, -dot_y_dy);
|
ggml_vec_acc1_f32(nc, dx, -dot_y_dy);
|
||||||
ggml_vec_mul_f32 (nc, dx, dx, y);
|
ggml_vec_mul_f32 (nc, dx, dx, y);
|
||||||
|
@ -12369,9 +12425,9 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32(
|
||||||
const int i1n = i10*ne11;
|
const int i1n = i10*ne11;
|
||||||
for (int i00 = 0; i00 < ne00; i00++) {
|
for (int i00 = 0; i00 < ne00; i00++) {
|
||||||
float v = 0;
|
float v = 0;
|
||||||
ggml_vec_dot_f16(ne02, &v,
|
ggml_vec_dot_f16(ne02, &v, 0,
|
||||||
(ggml_fp16_t *) wdata_src + i1n,
|
(ggml_fp16_t *) wdata_src + i1n, 0,
|
||||||
(ggml_fp16_t *) wdata_kernel + i00*ne02);
|
(ggml_fp16_t *) wdata_kernel + i00*ne02, 0, 1);
|
||||||
dst_data[i10*s0 + i00] += v;
|
dst_data[i10*s0 + i00] += v;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -12466,9 +12522,9 @@ static void ggml_compute_forward_conv_transpose_1d_f32(
|
||||||
const int i1n = i10*ne11;
|
const int i1n = i10*ne11;
|
||||||
for (int i00 = 0; i00 < ne00; i00++) {
|
for (int i00 = 0; i00 < ne00; i00++) {
|
||||||
float v = 0;
|
float v = 0;
|
||||||
ggml_vec_dot_f32(ne02, &v,
|
ggml_vec_dot_f32(ne02, &v, 0,
|
||||||
wdata_src + i1n,
|
wdata_src + i1n, 0,
|
||||||
wdata_kernel + i00*ne02);
|
wdata_kernel + i00*ne02, 0, 1);
|
||||||
dst_data[i10*s0 + i00] += v;
|
dst_data[i10*s0 + i00] += v;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -12783,9 +12839,9 @@ static void ggml_compute_forward_conv_transpose_2d(
|
||||||
for (int i01 = 0; i01 < ne01; i01++) {
|
for (int i01 = 0; i01 < ne01; i01++) {
|
||||||
for (int i00 = 0; i00 < ne00; i00++) {
|
for (int i00 = 0; i00 < ne00; i00++) {
|
||||||
float v = 0;
|
float v = 0;
|
||||||
ggml_vec_dot_f16(ne03, &v,
|
ggml_vec_dot_f16(ne03, &v, 0,
|
||||||
wdata_src + i1n,
|
wdata_src + i1n, 0,
|
||||||
wdata_kernel + i01*ne00*ne03 + i00*ne03);
|
wdata_kernel + i01*ne00*ne03 + i00*ne03, 0, 1);
|
||||||
dst_data[(i11*stride + i01)*ne0 + i10*stride + i00] += v;
|
dst_data[(i11*stride + i01)*ne0 + i10*stride + i00] += v;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -13214,9 +13270,9 @@ static void ggml_compute_forward_flash_attn_f32(
|
||||||
const int i1 = ik1;
|
const int i1 = ik1;
|
||||||
|
|
||||||
ggml_vec_dot_f32(neq0,
|
ggml_vec_dot_f32(neq0,
|
||||||
S + i1,
|
S + i1, 0,
|
||||||
(float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)),
|
(float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
|
||||||
(float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));
|
(float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
// scale
|
// scale
|
||||||
|
@ -13299,9 +13355,9 @@ static void ggml_compute_forward_flash_attn_f32(
|
||||||
const int iv3 = iq3;
|
const int iv3 = iq3;
|
||||||
|
|
||||||
ggml_vec_dot_f32(masked_begin,
|
ggml_vec_dot_f32(masked_begin,
|
||||||
(float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)),
|
(float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), 0,
|
||||||
(float *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)),
|
(float *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), 0,
|
||||||
S);
|
S, 0, 1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -13404,9 +13460,9 @@ static void ggml_compute_forward_flash_attn_f16(
|
||||||
const int i1 = ik1;
|
const int i1 = ik1;
|
||||||
|
|
||||||
ggml_vec_dot_f16(neq0,
|
ggml_vec_dot_f16(neq0,
|
||||||
S + i1,
|
S + i1, 0,
|
||||||
(ggml_fp16_t *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)),
|
(ggml_fp16_t *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
|
||||||
(ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));
|
(ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
for (int64_t ic = 0; ic < nek1; ic += GGML_VEC_DOT_UNROLL) {
|
for (int64_t ic = 0; ic < nek1; ic += GGML_VEC_DOT_UNROLL) {
|
||||||
|
@ -13508,9 +13564,9 @@ static void ggml_compute_forward_flash_attn_f16(
|
||||||
const int iv3 = iq3;
|
const int iv3 = iq3;
|
||||||
|
|
||||||
ggml_vec_dot_f16(nev0,
|
ggml_vec_dot_f16(nev0,
|
||||||
(float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)),
|
(float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), 0,
|
||||||
(ggml_fp16_t *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)),
|
(ggml_fp16_t *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), 0,
|
||||||
S16);
|
S16, 0, 1);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
for (int64_t ic = 0; ic < nev1; ic += GGML_VEC_DOT_UNROLL) {
|
for (int64_t ic = 0; ic < nev1; ic += GGML_VEC_DOT_UNROLL) {
|
||||||
|
@ -13652,9 +13708,9 @@ static void ggml_compute_forward_flash_ff_f16(
|
||||||
const int i1 = ib01;
|
const int i1 = ib01;
|
||||||
|
|
||||||
ggml_vec_dot_f16(nea0,
|
ggml_vec_dot_f16(nea0,
|
||||||
S + i1,
|
S + i1, 0,
|
||||||
(ggml_fp16_t *) ((char *) b0->data + (ib01*nbb01 + ib02*nbb02 + ib03*nbb03)),
|
(ggml_fp16_t *) ((char *) b0->data + (ib01*nbb01 + ib02*nbb02 + ib03*nbb03)), 0,
|
||||||
(ggml_fp16_t *) ((char *) a->data + ( ia1*nba1 + ia2*nba2 + ia3*nba3)));
|
(ggml_fp16_t *) ((char *) a->data + ( ia1*nba1 + ia2*nba2 + ia3*nba3)), 0, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_vec_add_f32(neb01, S, S, (float *) b1->data);
|
ggml_vec_add_f32(neb01, S, S, (float *) b1->data);
|
||||||
|
@ -13677,9 +13733,9 @@ static void ggml_compute_forward_flash_ff_f16(
|
||||||
for (int64_t ic = 0; ic < nec01; ++ic) {
|
for (int64_t ic = 0; ic < nec01; ++ic) {
|
||||||
|
|
||||||
ggml_vec_dot_f16(neb01,
|
ggml_vec_dot_f16(neb01,
|
||||||
(float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)),
|
(float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), 0,
|
||||||
(ggml_fp16_t *) ((char *) c0->data + ( ic*nbc01 + i2*nbc02 + i3*nbc03)),
|
(ggml_fp16_t *) ((char *) c0->data + ( ic*nbc01 + i2*nbc02 + i3*nbc03)), 0,
|
||||||
S16);
|
S16, 0, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_vec_add_f32(nec01,
|
ggml_vec_add_f32(nec01,
|
||||||
|
@ -13866,9 +13922,9 @@ static void ggml_compute_forward_flash_attn_back_f32(
|
||||||
const int i1 = ik1;
|
const int i1 = ik1;
|
||||||
|
|
||||||
ggml_vec_dot_f32(neq0,
|
ggml_vec_dot_f32(neq0,
|
||||||
S + i1,
|
S + i1, 0,
|
||||||
(float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)),
|
(float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
|
||||||
(float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));
|
(float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
// scale
|
// scale
|
||||||
|
@ -14013,7 +14069,7 @@ static void ggml_compute_forward_flash_attn_back_f32(
|
||||||
|
|
||||||
// S = SM * (S - dot(SM, S))
|
// S = SM * (S - dot(SM, S))
|
||||||
float dot_SM_gradSM = 0;
|
float dot_SM_gradSM = 0;
|
||||||
ggml_vec_dot_f32 (masked_begin, &dot_SM_gradSM, SM, S);
|
ggml_vec_dot_f32 (masked_begin, &dot_SM_gradSM, 0, SM, 0, S, 0, 1);
|
||||||
ggml_vec_acc1_f32(M, S, -dot_SM_gradSM);
|
ggml_vec_acc1_f32(M, S, -dot_SM_gradSM);
|
||||||
ggml_vec_mul_f32 (masked_begin, S, S, SM);
|
ggml_vec_mul_f32 (masked_begin, S, S, SM);
|
||||||
|
|
||||||
|
@ -16649,7 +16705,7 @@ struct ggml_compute_state_shared {
|
||||||
atomic_int node_n; // active graph node
|
atomic_int node_n; // active graph node
|
||||||
atomic_int node_task; // active graph node task phase
|
atomic_int node_task; // active graph node task phase
|
||||||
|
|
||||||
bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
|
ggml_abort_callback abort_callback; // abort ggml_graph_compute when true
|
||||||
void * abort_callback_data;
|
void * abort_callback_data;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -18382,7 +18438,7 @@ static enum ggml_opt_result linesearch_backtracking(
|
||||||
}
|
}
|
||||||
|
|
||||||
// compute the initial gradient in the search direction
|
// compute the initial gradient in the search direction
|
||||||
ggml_vec_dot_f32(nx, &dginit, g, d);
|
ggml_vec_dot_f32(nx, &dginit, 0, g, 0, d, 0, 1);
|
||||||
|
|
||||||
// make sure that d points to a descent direction
|
// make sure that d points to a descent direction
|
||||||
if (0 < dginit) {
|
if (0 < dginit) {
|
||||||
|
@ -18432,7 +18488,7 @@ static enum ggml_opt_result linesearch_backtracking(
|
||||||
return count;
|
return count;
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_vec_dot_f32(nx, &dg, g, d);
|
ggml_vec_dot_f32(nx, &dg, 0, g, 0, d, 0, 1);
|
||||||
|
|
||||||
// check the Wolfe condition
|
// check the Wolfe condition
|
||||||
if (dg < params->lbfgs.wolfe * dginit) {
|
if (dg < params->lbfgs.wolfe * dginit) {
|
||||||
|
@ -18693,8 +18749,8 @@ static enum ggml_opt_result ggml_opt_lbfgs(
|
||||||
// ys = y^t \cdot s -> 1 / \rho.
|
// ys = y^t \cdot s -> 1 / \rho.
|
||||||
// yy = y^t \cdot y.
|
// yy = y^t \cdot y.
|
||||||
//
|
//
|
||||||
ggml_vec_dot_f32(nx, &ys, &lm_y[end[0]*nx], &lm_s[end[0]*nx]);
|
ggml_vec_dot_f32(nx, &ys, 0, &lm_y[end[0]*nx], 0, &lm_s[end[0]*nx], 0, 1);
|
||||||
ggml_vec_dot_f32(nx, &yy, &lm_y[end[0]*nx], &lm_y[end[0]*nx]);
|
ggml_vec_dot_f32(nx, &yy, 0, &lm_y[end[0]*nx], 0, &lm_y[end[0]*nx], 0, 1);
|
||||||
|
|
||||||
lm_ys[end[0]] = ys;
|
lm_ys[end[0]] = ys;
|
||||||
|
|
||||||
|
@ -18713,7 +18769,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
|
||||||
for (int i = 0; i < bound; ++i) {
|
for (int i = 0; i < bound; ++i) {
|
||||||
j[0] = (j[0] + m - 1) % m;
|
j[0] = (j[0] + m - 1) % m;
|
||||||
// \alpha_{j} = \rho_{j} s^{t}_{j} \cdot q_{k+1}
|
// \alpha_{j} = \rho_{j} s^{t}_{j} \cdot q_{k+1}
|
||||||
ggml_vec_dot_f32(nx, &lm_alpha[j[0]], &lm_s[j[0]*nx], d);
|
ggml_vec_dot_f32(nx, &lm_alpha[j[0]], 0, &lm_s[j[0]*nx], 0, d, 0, 1);
|
||||||
lm_alpha[j[0]] /= lm_ys[j[0]];
|
lm_alpha[j[0]] /= lm_ys[j[0]];
|
||||||
// q_{i} = q_{i+1} - \alpha_{i} y_{i}
|
// q_{i} = q_{i+1} - \alpha_{i} y_{i}
|
||||||
ggml_vec_mad_f32(nx, d, &lm_y[j[0]*nx], -lm_alpha[j[0]]);
|
ggml_vec_mad_f32(nx, d, &lm_y[j[0]*nx], -lm_alpha[j[0]]);
|
||||||
|
@ -18723,7 +18779,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
|
||||||
|
|
||||||
for (int i = 0; i < bound; ++i) {
|
for (int i = 0; i < bound; ++i) {
|
||||||
// \beta_{j} = \rho_{j} y^t_{j} \cdot \gamma_{i}
|
// \beta_{j} = \rho_{j} y^t_{j} \cdot \gamma_{i}
|
||||||
ggml_vec_dot_f32(nx, &beta, &lm_y[j[0]*nx], d);
|
ggml_vec_dot_f32(nx, &beta, 0, &lm_y[j[0]*nx], 0, d, 0, 1);
|
||||||
beta /= lm_ys[j[0]];
|
beta /= lm_ys[j[0]];
|
||||||
// \gamma_{i+1} = \gamma_{i} + (\alpha_{j} - \beta_{j}) s_{j}
|
// \gamma_{i+1} = \gamma_{i} + (\alpha_{j} - \beta_{j}) s_{j}
|
||||||
ggml_vec_mad_f32(nx, d, &lm_s[j[0]*nx], lm_alpha[j[0]] - beta);
|
ggml_vec_mad_f32(nx, d, &lm_s[j[0]*nx], lm_alpha[j[0]] - beta);
|
||||||
|
@ -20611,4 +20667,12 @@ int ggml_cpu_has_vsx(void) {
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int ggml_cpu_has_matmul_int8(void) {
|
||||||
|
#if defined(__ARM_FEATURE_MATMUL_INT8)
|
||||||
|
return 1;
|
||||||
|
#else
|
||||||
|
return 0;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
12
ggml.h
12
ggml.h
|
@ -567,6 +567,11 @@ extern "C" {
|
||||||
|
|
||||||
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
|
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
|
||||||
|
|
||||||
|
// Abort callback
|
||||||
|
// If not NULL, called before ggml computation
|
||||||
|
// If it returns true, the computation is aborted
|
||||||
|
typedef bool (*ggml_abort_callback)(void * data);
|
||||||
|
|
||||||
// the compute plan that needs to be prepared for ggml_graph_compute()
|
// the compute plan that needs to be prepared for ggml_graph_compute()
|
||||||
// since https://github.com/ggerganov/ggml/issues/287
|
// since https://github.com/ggerganov/ggml/issues/287
|
||||||
struct ggml_cplan {
|
struct ggml_cplan {
|
||||||
|
@ -576,7 +581,7 @@ extern "C" {
|
||||||
int n_threads;
|
int n_threads;
|
||||||
|
|
||||||
// abort ggml_graph_compute when true
|
// abort ggml_graph_compute when true
|
||||||
bool (*abort_callback)(void * data);
|
ggml_abort_callback abort_callback;
|
||||||
void * abort_callback_data;
|
void * abort_callback_data;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -2273,6 +2278,7 @@ extern "C" {
|
||||||
GGML_API int ggml_cpu_has_ssse3 (void);
|
GGML_API int ggml_cpu_has_ssse3 (void);
|
||||||
GGML_API int ggml_cpu_has_sycl (void);
|
GGML_API int ggml_cpu_has_sycl (void);
|
||||||
GGML_API int ggml_cpu_has_vsx (void);
|
GGML_API int ggml_cpu_has_vsx (void);
|
||||||
|
GGML_API int ggml_cpu_has_matmul_int8(void);
|
||||||
|
|
||||||
//
|
//
|
||||||
// Internal types and functions exposed for tests and benchmarks
|
// Internal types and functions exposed for tests and benchmarks
|
||||||
|
@ -2286,7 +2292,8 @@ extern "C" {
|
||||||
#endif
|
#endif
|
||||||
typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||||
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||||
typedef void (*ggml_vec_dot_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
|
typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
|
||||||
|
const void * GGML_RESTRICT y, size_t by, int nrc);
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
const char * type_name;
|
const char * type_name;
|
||||||
|
@ -2298,6 +2305,7 @@ extern "C" {
|
||||||
ggml_from_float_t from_float_reference;
|
ggml_from_float_t from_float_reference;
|
||||||
ggml_vec_dot_t vec_dot;
|
ggml_vec_dot_t vec_dot;
|
||||||
enum ggml_type vec_dot_type;
|
enum ggml_type vec_dot_type;
|
||||||
|
int64_t nrows; // number of rows to process simultaneously;
|
||||||
} ggml_type_traits_t;
|
} ggml_type_traits_t;
|
||||||
|
|
||||||
GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
|
GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
|
||||||
|
|
|
@ -2067,6 +2067,8 @@ type_names = {
|
||||||
|
|
||||||
K_QUANTS_PER_ITERATION = 2
|
K_QUANTS_PER_ITERATION = 2
|
||||||
|
|
||||||
|
ASYNCIO_CONCURRENCY = 64
|
||||||
|
|
||||||
output_dir = gettempdir()
|
output_dir = gettempdir()
|
||||||
|
|
||||||
lock = asyncio.Lock()
|
lock = asyncio.Lock()
|
||||||
|
@ -2291,7 +2293,14 @@ async def main():
|
||||||
tasks.append(string_to_spv("rope_neox_f32", rope_neox_src, {"A_TYPE": "float", "D_TYPE": "float"}))
|
tasks.append(string_to_spv("rope_neox_f32", rope_neox_src, {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||||
tasks.append(string_to_spv("rope_neox_f16", rope_neox_src, {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}))
|
tasks.append(string_to_spv("rope_neox_f16", rope_neox_src, {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}))
|
||||||
|
|
||||||
await asyncio.gather(*tasks)
|
# Helper to decorate tasks with semaphore acquisition.
|
||||||
|
async def withSemaphore(sem, task):
|
||||||
|
async with sem:
|
||||||
|
return await task
|
||||||
|
|
||||||
|
# Run tasks concurrently guarded by a concurrency limit.
|
||||||
|
sem = asyncio.Semaphore(ASYNCIO_CONCURRENCY)
|
||||||
|
await asyncio.gather(*(withSemaphore(sem, task) for task in tasks))
|
||||||
|
|
||||||
with open("ggml-vulkan-shaders.hpp", "w") as f:
|
with open("ggml-vulkan-shaders.hpp", "w") as f:
|
||||||
f.write("#include <cstdint>\n\n")
|
f.write("#include <cstdint>\n\n")
|
||||||
|
|
|
@ -7285,7 +7285,9 @@ static int llama_decode_internal(
|
||||||
// TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
|
// TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
|
||||||
// we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
|
// we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
|
||||||
// with the BLAS calls. need a better solution
|
// with the BLAS calls. need a better solution
|
||||||
if (n_tokens >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
|
// MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is
|
||||||
|
// being processed then Accelerate/BLAS will not be involved, so capping would limit performance.
|
||||||
|
if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
|
||||||
n_threads = std::min(4, n_threads);
|
n_threads = std::min(4, n_threads);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -11867,6 +11869,7 @@ const char * llama_print_system_info(void) {
|
||||||
s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
|
s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
|
||||||
s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
|
s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
|
||||||
s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
|
s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
|
||||||
|
s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";
|
||||||
|
|
||||||
return s.c_str();
|
return s.c_str();
|
||||||
}
|
}
|
||||||
|
|
|
@ -156,8 +156,8 @@ int main(int argc, char** argv) {
|
||||||
|
|
||||||
t1 = std::chrono::high_resolution_clock::now();
|
t1 = std::chrono::high_resolution_clock::now();
|
||||||
float fs;
|
float fs;
|
||||||
if (type == 0) funcs.vec_dot(kVecSize * QK4_1, &fs, x40.data(), y.data());
|
if (type == 0) funcs.vec_dot(kVecSize * QK4_1, &fs, 0, x40.data(), 0, y.data(), 0, 1);
|
||||||
else funcs.vec_dot(kVecSize * QK4_1, &fs, x41.data(), y.data());
|
else funcs.vec_dot(kVecSize * QK4_1, &fs, 0, x41.data(), 0, y.data(), 0, 1);
|
||||||
t2 = std::chrono::high_resolution_clock::now();
|
t2 = std::chrono::high_resolution_clock::now();
|
||||||
t = 1e-3*std::chrono::duration_cast<std::chrono::nanoseconds>(t2-t1).count();
|
t = 1e-3*std::chrono::duration_cast<std::chrono::nanoseconds>(t2-t1).count();
|
||||||
if (iloop > 3) ggml.addResult(fs, t);
|
if (iloop > 3) ggml.addResult(fs, t);
|
||||||
|
|
|
@ -284,8 +284,8 @@ int main(int argc, char** argv) {
|
||||||
else {
|
else {
|
||||||
auto vdot = ggml_internal_get_type_traits(funcs.vec_dot_type);
|
auto vdot = ggml_internal_get_type_traits(funcs.vec_dot_type);
|
||||||
vdot.from_float(y1.data(), q8.data(), kVecSize);
|
vdot.from_float(y1.data(), q8.data(), kVecSize);
|
||||||
if (useQ4_1) funcs.vec_dot(kVecSize, &result, q41.data(), q8.data());
|
if (useQ4_1) funcs.vec_dot(kVecSize, &result, 0, q41.data(), 0, q8.data(), 0, 1);
|
||||||
else funcs.vec_dot(kVecSize, &result, q40.data(), q8.data());
|
else funcs.vec_dot(kVecSize, &result, 0, q40.data(), 0, q8.data(), 0, 1);
|
||||||
}
|
}
|
||||||
sumq += result;
|
sumq += result;
|
||||||
t2 = std::chrono::high_resolution_clock::now();
|
t2 = std::chrono::high_resolution_clock::now();
|
||||||
|
|
|
@ -97,6 +97,8 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
|
||||||
# src/ggml-cuda.cu -> ggml-cuda.cu
|
# src/ggml-cuda.cu -> ggml-cuda.cu
|
||||||
# src/ggml-cuda.h -> ggml-cuda.h
|
# src/ggml-cuda.h -> ggml-cuda.h
|
||||||
# src/ggml-impl.h -> ggml-impl.h
|
# src/ggml-impl.h -> ggml-impl.h
|
||||||
|
# src/ggml-kompute.cpp -> ggml-kompute.cpp
|
||||||
|
# src/ggml-kompute.h -> ggml-kompute.h
|
||||||
# src/ggml-metal.h -> ggml-metal.h
|
# src/ggml-metal.h -> ggml-metal.h
|
||||||
# src/ggml-metal.m -> ggml-metal.m
|
# src/ggml-metal.m -> ggml-metal.m
|
||||||
# src/ggml-mpi.h -> ggml-mpi.h
|
# src/ggml-mpi.h -> ggml-mpi.h
|
||||||
|
@ -105,6 +107,10 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
|
||||||
# src/ggml-opencl.h -> ggml-opencl.h
|
# src/ggml-opencl.h -> ggml-opencl.h
|
||||||
# src/ggml-quants.c -> ggml-quants.c
|
# src/ggml-quants.c -> ggml-quants.c
|
||||||
# src/ggml-quants.h -> ggml-quants.h
|
# src/ggml-quants.h -> ggml-quants.h
|
||||||
|
# src/ggml-sycl.cpp -> ggml-sycl.cpp
|
||||||
|
# src/ggml-sycl.h -> ggml-sycl.h
|
||||||
|
# src/ggml-vulkan.cpp -> ggml-vulkan.cpp
|
||||||
|
# src/ggml-vulkan.h -> ggml-vulkan.h
|
||||||
# include/ggml/ggml.h -> ggml.h
|
# include/ggml/ggml.h -> ggml.h
|
||||||
# include/ggml/ggml-alloc.h -> ggml-alloc.h
|
# include/ggml/ggml-alloc.h -> ggml-alloc.h
|
||||||
# include/ggml/ggml-backend.h -> ggml-backend.h
|
# include/ggml/ggml-backend.h -> ggml-backend.h
|
||||||
|
@ -123,6 +129,8 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
|
||||||
-e 's/src\/ggml-cuda\.cu/ggml-cuda.cu/g' \
|
-e 's/src\/ggml-cuda\.cu/ggml-cuda.cu/g' \
|
||||||
-e 's/src\/ggml-cuda\.h/ggml-cuda.h/g' \
|
-e 's/src\/ggml-cuda\.h/ggml-cuda.h/g' \
|
||||||
-e 's/src\/ggml-impl\.h/ggml-impl.h/g' \
|
-e 's/src\/ggml-impl\.h/ggml-impl.h/g' \
|
||||||
|
-e 's/src\/ggml-kompute\.cpp/ggml-kompute.cpp/g' \
|
||||||
|
-e 's/src\/ggml-kompute\.h/ggml-kompute.h/g' \
|
||||||
-e 's/src\/ggml-metal\.h/ggml-metal.h/g' \
|
-e 's/src\/ggml-metal\.h/ggml-metal.h/g' \
|
||||||
-e 's/src\/ggml-metal\.m/ggml-metal.m/g' \
|
-e 's/src\/ggml-metal\.m/ggml-metal.m/g' \
|
||||||
-e 's/src\/ggml-mpi\.h/ggml-mpi.h/g' \
|
-e 's/src\/ggml-mpi\.h/ggml-mpi.h/g' \
|
||||||
|
@ -131,6 +139,10 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
|
||||||
-e 's/src\/ggml-opencl\.h/ggml-opencl.h/g' \
|
-e 's/src\/ggml-opencl\.h/ggml-opencl.h/g' \
|
||||||
-e 's/src\/ggml-quants\.c/ggml-quants.c/g' \
|
-e 's/src\/ggml-quants\.c/ggml-quants.c/g' \
|
||||||
-e 's/src\/ggml-quants\.h/ggml-quants.h/g' \
|
-e 's/src\/ggml-quants\.h/ggml-quants.h/g' \
|
||||||
|
-e 's/src\/ggml-sycl\.cpp/ggml-sycl.cpp/g' \
|
||||||
|
-e 's/src\/ggml-sycl\.h/ggml-sycl.h/g' \
|
||||||
|
-e 's/src\/ggml-vulkan\.cpp/ggml-vulkan.cpp/g' \
|
||||||
|
-e 's/src\/ggml-vulkan\.h/ggml-vulkan.h/g' \
|
||||||
-e 's/include\/ggml\/ggml\.h/ggml.h/g' \
|
-e 's/include\/ggml\/ggml\.h/ggml.h/g' \
|
||||||
-e 's/include\/ggml\/ggml-alloc\.h/ggml-alloc.h/g' \
|
-e 's/include\/ggml\/ggml-alloc\.h/ggml-alloc.h/g' \
|
||||||
-e 's/include\/ggml\/ggml-backend\.h/ggml-backend.h/g' \
|
-e 's/include\/ggml\/ggml-backend\.h/ggml-backend.h/g' \
|
||||||
|
|
|
@ -1 +1 @@
|
||||||
475cbad5c1c834e31e26a2283bc1413181644360
|
2c7cf49810d523b9632da393a9e8270b60bf3b24
|
||||||
|
|
|
@ -7,6 +7,8 @@ cp -rpv ../ggml/src/ggml-backend.c ./ggml-backend.c
|
||||||
cp -rpv ../ggml/src/ggml-cuda.cu ./ggml-cuda.cu
|
cp -rpv ../ggml/src/ggml-cuda.cu ./ggml-cuda.cu
|
||||||
cp -rpv ../ggml/src/ggml-cuda.h ./ggml-cuda.h
|
cp -rpv ../ggml/src/ggml-cuda.h ./ggml-cuda.h
|
||||||
cp -rpv ../ggml/src/ggml-impl.h ./ggml-impl.h
|
cp -rpv ../ggml/src/ggml-impl.h ./ggml-impl.h
|
||||||
|
cp -rpv ../ggml/src/ggml-kompute.cpp ./ggml-kompute.cpp
|
||||||
|
cp -rpv ../ggml/src/ggml-kompute.h ./ggml-kompute.h
|
||||||
cp -rpv ../ggml/src/ggml-metal.h ./ggml-metal.h
|
cp -rpv ../ggml/src/ggml-metal.h ./ggml-metal.h
|
||||||
cp -rpv ../ggml/src/ggml-metal.m ./ggml-metal.m
|
cp -rpv ../ggml/src/ggml-metal.m ./ggml-metal.m
|
||||||
cp -rpv ../ggml/src/ggml-metal.metal ./ggml-metal.metal
|
cp -rpv ../ggml/src/ggml-metal.metal ./ggml-metal.metal
|
||||||
|
@ -16,6 +18,10 @@ cp -rpv ../ggml/src/ggml-opencl.cpp ./ggml-opencl.cpp
|
||||||
cp -rpv ../ggml/src/ggml-opencl.h ./ggml-opencl.h
|
cp -rpv ../ggml/src/ggml-opencl.h ./ggml-opencl.h
|
||||||
cp -rpv ../ggml/src/ggml-quants.c ./ggml-quants.c
|
cp -rpv ../ggml/src/ggml-quants.c ./ggml-quants.c
|
||||||
cp -rpv ../ggml/src/ggml-quants.h ./ggml-quants.h
|
cp -rpv ../ggml/src/ggml-quants.h ./ggml-quants.h
|
||||||
|
cp -rpv ../ggml/src/ggml-sycl.cpp ./ggml-sycl.cpp
|
||||||
|
cp -rpv ../ggml/src/ggml-sycl.h ./ggml-sycl.h
|
||||||
|
cp -rpv ../ggml/src/ggml-vulkan.cpp ./ggml-vulkan.cpp
|
||||||
|
cp -rpv ../ggml/src/ggml-vulkan.h ./ggml-vulkan.h
|
||||||
cp -rpv ../ggml/include/ggml/ggml.h ./ggml.h
|
cp -rpv ../ggml/include/ggml/ggml.h ./ggml.h
|
||||||
cp -rpv ../ggml/include/ggml/ggml-alloc.h ./ggml-alloc.h
|
cp -rpv ../ggml/include/ggml/ggml-alloc.h ./ggml-alloc.h
|
||||||
cp -rpv ../ggml/include/ggml/ggml-backend.h ./ggml-backend.h
|
cp -rpv ../ggml/include/ggml/ggml-backend.h ./ggml-backend.h
|
||||||
|
|
|
@ -87,7 +87,7 @@ static float dot_product_error(
|
||||||
vdot.from_float(test_data2, tmp_q2.data(), test_size);
|
vdot.from_float(test_data2, tmp_q2.data(), test_size);
|
||||||
|
|
||||||
float result = INFINITY;
|
float result = INFINITY;
|
||||||
qfns.vec_dot(test_size, &result, tmp_q1.data(), tmp_q2.data());
|
qfns.vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1);
|
||||||
|
|
||||||
const float dot_ref = dot_product(test_data1, test_data2, test_size);
|
const float dot_ref = dot_product(test_data1, test_data2, test_size);
|
||||||
|
|
||||||
|
|
|
@ -346,7 +346,7 @@ int main(int argc, char * argv[]) {
|
||||||
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
||||||
auto quantize_fn = [&](void) -> float {
|
auto quantize_fn = [&](void) -> float {
|
||||||
float result;
|
float result;
|
||||||
qfns.vec_dot(size, &result, test_q1, test_q2);
|
qfns.vec_dot(size, &result, 0, test_q1, 0, test_q2, 0, 1);
|
||||||
return result;
|
return result;
|
||||||
};
|
};
|
||||||
size_t quantized_size = ggml_row_size(type, size);
|
size_t quantized_size = ggml_row_size(type, size);
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue