Merge branch 'master' into concedo_experimental

# Conflicts:
#	CMakeLists.txt
#	Makefile
#	README.md
#	tests/test-quantize-perf.cpp
Concedo 2023-06-27 19:15:27 +08:00
commit 282376c85a
21 changed files with 2853 additions and 482 deletions


@@ -566,8 +566,8 @@ struct ggml_tensor * forward(
// wk shape [n_embd, n_embd, 1, 1]
// Qcur shape [n_embd/n_head, n_head, N, 1]
// Kcur shape [n_embd/n_head, n_head, N, 1]
-struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
-struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
+struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
+struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
// store key and value to memory
{
@@ -823,8 +823,8 @@ struct ggml_tensor * forward_batch(
// wk shape [n_embd, n_embd, 1, 1]
// Qcur shape [n_embd/n_head, n_head, N, n_batch]
// Kcur shape [n_embd/n_head, n_head, N, n_batch]
-struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0);
-struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0);
+struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0);
+struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0);
assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch);
assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch);
@@ -1116,7 +1116,7 @@ struct ggml_tensor * forward_lora(
model->layers[il].wqb,
cur)),
n_embd/n_head, n_head, N),
-n_past, n_rot, 0);
+n_past, n_rot, 0, 0);
struct ggml_tensor * Kcur = ggml_rope(ctx0,
ggml_reshape_3d(ctx0,
ggml_mul_mat(ctx0,
@@ -1125,7 +1125,7 @@ struct ggml_tensor * forward_lora(
model->layers[il].wkb,
cur)),
n_embd/n_head, n_head, N),
-n_past, n_rot, 0);
+n_past, n_rot, 0, 0);
// store key and value to memory
{


@@ -343,6 +343,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
params.use_mmap = false;
} else if (arg == "--mtest") {
params.mem_test = true;
+} else if (arg == "--numa") {
+params.numa = true;
} else if (arg == "--export") {
params.export_cgraph = true;
} else if (arg == "--verbose-prompt") {
@@ -488,6 +490,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
if (llama_mmap_supported()) {
fprintf(stderr, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
}
+fprintf(stderr, " --numa attempt optimizations that help on some NUMA systems\n");
+fprintf(stderr, " if run without this previously, it is recommended to drop the system page cache before using this\n");
+fprintf(stderr, " see https://github.com/ggerganov/llama.cpp/issues/1437\n");
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
fprintf(stderr, " -ngl N, --n-gpu-layers N\n");
fprintf(stderr, " number of layers to store in VRAM\n");


@@ -76,6 +76,7 @@ struct gpt_params {
bool use_mmap = true; // use mmap for faster loads
bool use_mlock = false; // use mlock to keep model in memory
bool mem_test = false; // compute maximum memory usage
+bool numa = false; // attempt optimizations that help on some NUMA systems
bool export_cgraph = false; // export the computation graph
bool verbose_prompt = false; // print prompt tokens before generation
};
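The new `numa` flag added to `gpt_params` above is consumed by `llama_init_backend(params.numa)` in the example programs changed below. A minimal usage sketch; the header names `common.h` and `llama.h` are assumptions, while `gpt_params_parse` and `llama_init_backend` are used exactly as they appear in this diff:

```cpp
// Sketch only: how the new gpt_params::numa flag reaches the backend initialization.
#include "common.h"   // assumed header providing gpt_params / gpt_params_parse
#include "llama.h"    // assumed header providing llama_init_backend

int main(int argc, char ** argv) {
    gpt_params params;
    if (!gpt_params_parse(argc, argv, params)) {
        return 1; // invalid arguments; usage has already been printed
    }
    // passing --numa on the command line sets params.numa = true (see the parser change above)
    llama_init_backend(params.numa);
    // ... load the model and generate as in the examples below ...
    return 0;
}
```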


@@ -35,7 +35,7 @@ int main(int argc, char ** argv) {
params.prompt = gpt_random_prompt(rng);
}
-llama_init_backend();
+llama_init_backend(params.numa);
llama_model * model;
llama_context * ctx;


@@ -262,6 +262,10 @@ These options help improve the performance and memory usage of the LLaMA models.
- `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed. However, if the model is larger than your total amount of RAM or if your system is low on available memory, using mmap might increase the risk of pageouts, negatively impacting performance. Disabling mmap results in slower load times but may reduce pageouts if you're not using `--mlock`. Note that if the model is larger than the total amount of RAM, turning off mmap would prevent the model from loading at all.
+### NUMA support
+- `--numa`: Attempt optimizations that help on some systems with non-uniform memory access. This currently consists of pinning an equal proportion of the threads to the cores on each NUMA node, and disabling prefetch and readahead for mmap. The latter causes mapped pages to be faulted in on first access instead of all at once, and in combination with pinning threads to NUMA nodes, more of the pages end up on the NUMA node where they are used. Note that if the model is already in the system page cache, for example because of a previous run without this option, this will have little effect unless you drop the page cache first. This can be done by rebooting the system or on Linux by writing '3' to '/proc/sys/vm/drop\_caches' as root.
### Memory Float 32
- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. This doubles the context memory requirement and cached prompt file size but does not appear to increase generation quality in a measurable way. Not recommended.
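The `--numa` entry above describes pinning an equal share of the threads to the cores of each NUMA node. A rough sketch of that idea, assuming Linux with libnuma available; this is an illustration only, not the ggml implementation (which is not part of this diff):

```cpp
// Hypothetical illustration of per-node thread pinning; the actual ggml code may differ.
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <numa.h>    // libnuma: numa_available, numa_num_configured_nodes, numa_node_to_cpus
#include <sched.h>   // cpu_set_t, CPU_ZERO, CPU_SET, sched_setaffinity

// Pin worker thread `ith` to the CPUs of one NUMA node, spreading threads round-robin.
static void pin_thread_to_numa_node(int ith) {
    if (numa_available() < 0) {
        return; // no NUMA support reported; leave the thread's affinity untouched
    }
    const int node = ith % numa_num_configured_nodes();

    struct bitmask * cpus = numa_allocate_cpumask();
    if (numa_node_to_cpus(node, cpus) == 0) {
        cpu_set_t set;
        CPU_ZERO(&set);
        for (unsigned int i = 0; i < cpus->size; ++i) {
            if (numa_bitmask_isbitset(cpus, i)) {
                CPU_SET(i, &set); // allow this thread only on the node's CPUs
            }
        }
        sched_setaffinity(0, sizeof(set), &set); // pid 0 = calling thread
    }
    numa_free_cpumask(cpus);
}
```

The mmap side of the option (disabling prefetch/readahead so pages fault in on first touch) is what lets page placement follow the pinned threads.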


@@ -105,7 +105,7 @@ int main(int argc, char ** argv) {
params.prompt = gpt_random_prompt(rng);
}
-llama_init_backend();
+llama_init_backend(params.numa);
llama_model * model;
llama_context * ctx;


@@ -147,7 +147,7 @@ int main(int argc, char ** argv) {
params.prompt = gpt_random_prompt(rng);
}
-llama_init_backend();
+llama_init_backend(params.numa);
llama_model * model;
llama_context * ctx;


@@ -178,7 +178,7 @@ int main(int argc, char ** argv) {
usage(argv[0]);
}
-llama_init_backend();
+llama_init_backend(false);
// parse command line arguments
const std::string fname_inp = argv[arg_idx];


@@ -789,7 +789,7 @@ int main(int argc, char ** argv) {
params.model_alias = params.model;
}
-llama_init_backend();
+llama_init_backend(params.numa);
LOG_INFO("build info", {
{ "build", BUILD_NUMBER },


@@ -66,7 +66,7 @@ int main(int argc, char ** argv)
// Init LLM :
//---------------------------------
-llama_init_backend();
+llama_init_backend(params.numa);
llama_model * model;
llama_context * ctx;


@@ -294,20 +294,9 @@ void init_model(struct my_llama_model * model) {
ggml_set_name(layer.ffn_norm, (layers_i + ".ffn_norm.weight").c_str());
-// 'layers.10.feed_forward.w1.weight' has length of 32.
-// ggml_tensor->name only has 32 characters, but we need one more for the '\0' terminator.
-// ggml_set_name will set the last character to '\0', so we can only store 'layers.10.feed_forward.w1.weigh'.
-// when saving llama compatible model the tensors names will miss a character.
-// ggml_set_name(layer.w1, (layers_i + ".feed_forward.w1.weight").c_str());
-// ggml_set_name(layer.w2, (layers_i + ".feed_forward.w2.weight").c_str());
-// ggml_set_name(layer.w3, (layers_i + ".feed_forward.w3.weight").c_str());
-strncpy(layer.w1->name, (layers_i + ".feed_forward.w1.weight").c_str(), sizeof(layer.w1->name));
-strncpy(layer.w2->name, (layers_i + ".feed_forward.w2.weight").c_str(), sizeof(layer.w2->name));
-strncpy(layer.w3->name, (layers_i + ".feed_forward.w3.weight").c_str(), sizeof(layer.w3->name));
-layer.w1->padding[0] = 0;
-layer.w2->padding[0] = 0;
-layer.w3->padding[0] = 0;
+ggml_format_name(layer.w1, "%s.feed_forward.w1.weight", layers_i.c_str());
+ggml_format_name(layer.w2, "%s.feed_forward.w2.weight", layers_i.c_str());
+ggml_format_name(layer.w3, "%s.feed_forward.w3.weight", layers_i.c_str());
}
}
@@ -454,8 +443,8 @@ struct ggml_tensor * forward(
// wk shape [n_embd, n_embd, 1, 1]
// Qcur shape [n_embd/n_head, n_head, N, 1]
// Kcur shape [n_embd/n_head, n_head, N, 1]
-struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
-struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
+struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
+struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
// store key and value to memory
{
@@ -711,8 +700,8 @@ struct ggml_tensor * forward_batch(
// wk shape [n_embd, n_embd, 1, 1]
// Qcur shape [n_embd/n_head, n_head, N, n_batch]
// Kcur shape [n_embd/n_head, n_head, N, n_batch]
-struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0);
-struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0);
+struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0);
+struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0);
assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch);
assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch);
@@ -996,8 +985,8 @@ struct ggml_tensor * forward_batch_wo_cache(
// wk shape [n_embd, n_embd, 1, 1]
// Qcur shape [n_embd/n_head, n_head, N, n_batch]
// Kcur shape [n_embd/n_head, n_head, N, n_batch]
-struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0);
-struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0);
+struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0);
+struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0);
assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch);
assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch);
@@ -1218,8 +1207,8 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn(
// compute Q and K and RoPE them
// wq shape [n_embd, n_embd, 1, 1]
// wk shape [n_embd, n_embd, 1, 1]
-struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0);
-struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0);
+struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0);
+struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0);
assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch);
assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch);
@@ -1618,10 +1607,10 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(
use_buf(-1); struct ggml_tensor * t04 = expand(gf, ggml_mul (ctx0, t02, t03)); assert_shape_2d(t04, n_embd, N*n_batch);
use_buf(-1); struct ggml_tensor * t05 = expand(gf, ggml_mul_mat (ctx0, layer.wq, t04)); assert_shape_2d(t05, n_embd, N*n_batch);
use_buf(-1); struct ggml_tensor * t06 = expand(gf, ggml_reshape_4d (ctx0, t05, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t06, n_embd/n_head, n_head, N, n_batch);
-use_buf(-1); struct ggml_tensor * t07 = expand(gf, ggml_rope_inplace (ctx0, t06, n_past, n_rot, rope_mode)); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch);
+use_buf(-1); struct ggml_tensor * t07 = expand(gf, ggml_rope_inplace (ctx0, t06, n_past, n_rot, rope_mode, 0)); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch);
use_buf(-1); struct ggml_tensor * t08 = expand(gf, ggml_mul_mat (ctx0, layer.wk, t04)); assert_shape_2d(t08, n_embd, N*n_batch);
use_buf(-1); struct ggml_tensor * t09 = expand(gf, ggml_reshape_4d (ctx0, t08, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t09, n_embd/n_head, n_head, N, n_batch);
-use_buf(-1); struct ggml_tensor * t10 = expand(gf, ggml_rope_inplace (ctx0, t09, n_past, n_rot, rope_mode)); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch);
+use_buf(-1); struct ggml_tensor * t10 = expand(gf, ggml_rope_inplace (ctx0, t09, n_past, n_rot, rope_mode, 0)); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch);
use_buf(-1); struct ggml_tensor * t11 = expand(gf, ggml_mul_mat (ctx0, t04, layer.wv)); assert_shape_2d(t11, N*n_batch, n_embd);
use_buf(-1); struct ggml_tensor * t12 = expand(gf, ggml_reshape_4d (ctx0, t11, N, n_batch, n_embd/n_head, n_head)); assert_shape_4d(t12, N, n_batch, n_embd/n_head, n_head);
use_buf(-1); struct ggml_tensor * t13 = expand(gf, ggml_permute (ctx0, t07, 0, 2, 1, 3)); assert_shape_4d(t13, n_embd/n_head, N, n_head, n_batch);
@@ -2368,7 +2357,7 @@ void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) {
file->write_u32(0);
file->write_u32(0);
file->write_u32(GGML_TYPE_F32);
-file->seek(0-file->tell() & 31, SEEK_CUR);
+file->seek((0-file->tell()) & 31, SEEK_CUR);
return;
}
const char * name = ggml_get_name(tensor);
@@ -2383,7 +2372,7 @@ void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) {
file->write_u32(tensor->type);
file->write_raw(ne, sizeof(ne[0]) * nd);
file->write_raw(name, name_len);
-file->seek(0-file->tell() & 31, SEEK_CUR);
+file->seek((0-file->tell()) & 31, SEEK_CUR);
file->write_raw(tensor->data, ggml_nbytes(tensor));
}
@@ -2404,7 +2393,7 @@ void read_tensor(struct llama_file * file, struct ggml_tensor * tensor) {
std::string name = file->read_string(name_len);
GGML_ASSERT(strncmp(ggml_get_name(tensor), name.c_str(), sizeof(tensor->name)-1) == 0);
-file->seek(0-file->tell() & 31, SEEK_CUR);
+file->seek((0-file->tell()) & 31, SEEK_CUR);
file->read_raw(tensor->data, ggml_nbytes(tensor));
}
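On the `file->seek((0-file->tell()) & 31, SEEK_CUR)` changes above: the added parentheses only make the intent explicit (binary `-` already binds tighter than `&`); the expression computes how many bytes to skip to reach the next 32-byte boundary. A standalone check of the idiom:

```cpp
// Standalone check of the (0 - tell) & 31 alignment idiom used in write_tensor/read_tensor:
// it yields the padding needed to advance an offset to the next 32-byte boundary.
#include <cstdio>

int main() {
    for (unsigned long tell = 30; tell <= 34; ++tell) {
        const unsigned long pad = (0 - tell) & 31;   // 0 when tell is already a multiple of 32
        std::printf("tell=%lu pad=%lu aligned=%lu\n", tell, pad, tell + pad);
    }
    return 0;
}
```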


@@ -117,7 +117,13 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 blo
//================================= k-quants
+#ifdef GGML_QKK_64
+#define QK_K 64
+#define K_SCALE_SIZE 4
+#else
#define QK_K 256
+#define K_SCALE_SIZE 12
+#endif
typedef struct {
uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
@@ -128,13 +134,25 @@ typedef struct {
static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
typedef struct {
-uint8_t hmask[QK_K/8];
-uint8_t qs[QK_K/4]; // nibbles / quants
-uint8_t scales[3*QK_K/64];
-half d;
+uint8_t hmask[QK_K/8]; // quants - high bit
+uint8_t qs[QK_K/4]; // quants - low 2 bits
+#ifdef GGML_QKK_64
+uint8_t scales[2]; // scales, quantized with 8 bits
+#else
+uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
+#endif
+half d; // super-block scale
} block_q3_K;
-static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + 11 * QK_K / 64, "wrong q3_K block size/padding");
+//static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + K_SCALE_SIZE, "wrong q3_K block size/padding");
+#ifdef GGML_QKK_64
+typedef struct {
+half d[2]; // super-block scales/mins
+uint8_t scales[2]; // 4-bit block scales/mins
+uint8_t qs[QK_K/2]; // 4--bit quants
+} block_q4_K;
+static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding");
+#else
typedef struct {
half d; // super-block scale for quantized scales
half dmin; // super-block scale for quantized mins
@@ -142,15 +160,26 @@ typedef struct {
uint8_t qs[QK_K/2]; // 4--bit quants
} block_q4_K;
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
+#endif
+#ifdef GGML_QKK_64
typedef struct {
-half d; // super-block scale for quantized scales
-half dmin; // super-block scale for quantized mins
-uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
+half d; // super-block scale
+int8_t scales[QK_K/16]; // block scales
+uint8_t qh[QK_K/8]; // quants, high bit
+uint8_t qs[QK_K/2]; // quants, low 4 bits
+} block_q5_K;
+static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
+#else
+typedef struct {
+half d; // super-block scale for quantized scales
+half dmin; // super-block scale for quantized mins
+uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
uint8_t qh[QK_K/8]; // quants, high bit
uint8_t qs[QK_K/2]; // quants, low 4 bits
} block_q5_K;
-static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
+static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
+#endif
typedef struct {
uint8_t ql[QK_K/2]; // quants, lower 4 bits
@@ -349,13 +378,14 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in
static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {
const int i = blockIdx.x;
+const block_q2_K * x = (const block_q2_K *) vx;
const int tid = threadIdx.x;
+#if QK_K == 256
const int n = tid/32;
const int l = tid - 32*n;
const int is = 8*n + l/16;
-const block_q2_K * x = (const block_q2_K *) vx;
const uint8_t q = x[i].qs[32*n + l];
float * y = yy + i*QK_K + 128*n;
@@ -365,21 +395,32 @@ static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {
y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
+#else
+const int is = tid/16; // 0 or 1
+const int il = tid%16; // 0...15
+const uint8_t q = x[i].qs[il] >> (2*is);
+float * y = yy + i*QK_K + 16*is + il;
+float dall = x[i].d;
+float dmin = x[i].dmin;
+y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
+y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
+#endif
}
static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {
-int r = threadIdx.x/4;
-int i = blockIdx.x;
-int tid = r/2;
-int is0 = r%2;
-int l0 = 16*is0 + 4*(threadIdx.x%4);
-int n = tid / 4;
-int j = tid - 4*n;
+const int i = blockIdx.x;
const block_q3_K * x = (const block_q3_K *) vx;
+#if QK_K == 256
+const int r = threadIdx.x/4;
+const int tid = r/2;
+const int is0 = r%2;
+const int l0 = 16*is0 + 4*(threadIdx.x%4);
+const int n = tid / 4;
+const int j = tid - 4*n;
uint8_t m = 1 << (4*n + j);
int is = 8*n + 2*j + is0;
int shift = 2*j;
@@ -396,9 +437,31 @@ static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {
const uint8_t * hm = x[i].hmask;
for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
+#else
+const int tid = threadIdx.x;
+const int is = tid/16; // 0 or 1
+const int il = tid%16; // 0...15
+const int im = il/8; // 0...1
+const int in = il%8; // 0...7
+float * y = yy + i*QK_K + 16*is + il;
+const uint8_t q = x[i].qs[il] >> (2*is);
+const uint8_t h = x[i].hmask[in] >> (2*is + im);
+const float d = (float)x[i].d;
+if (is == 0) {
+y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
+y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
+} else {
+y[ 0] = d * ((x[i].scales[0] >> 4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
+y[32] = d * ((x[i].scales[1] >> 4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
+}
+#endif
}
+#if QK_K == 256
static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
if (j < 4) {
d = q[j] & 63; m = q[j + 4] & 63;
@@ -407,19 +470,14 @@ static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t
m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4);
}
}
+#endif
static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
const block_q4_K * x = (const block_q4_K *) vx;
const int i = blockIdx.x;
-//// assume 64 threads - this is very slightly better than the one below
-//const int tid = threadIdx.x;
-//const int il = tid/16;
-//const int ir = tid%16;
-//const int is = 2*il;
-//const int n = 2;
+#if QK_K == 256
// assume 32 threads
const int tid = threadIdx.x;
const int il = tid/8;
@@ -443,6 +501,15 @@ static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
y[l + 0] = d1 * (q[l] & 0xF) - m1;
y[l +32] = d2 * (q[l] >> 4) - m2;
}
+#else
+const int tid = threadIdx.x;
+const uint8_t * q = x[i].qs;
+float * y = yy + i*QK_K;
+const float d = (float)x[i].d[0];
+const float m = (float)x[i].d[1];
+y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
+y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >> 4) - m * (x[i].scales[1] >> 4);
+#endif
}
static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
@@ -450,6 +517,7 @@ static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
const int i = blockIdx.x;
+#if QK_K == 256
// assume 64 threads - this is very slightly better than the one below
const int tid = threadIdx.x;
const int il = tid/16; // il is in 0...3
@@ -476,12 +544,25 @@ static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
hm <<= 1;
y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2;
y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
+#else
+const int tid = threadIdx.x;
+const uint8_t q = x[i].qs[tid];
+const int im = tid/8; // 0...3
+const int in = tid%8; // 0...7
+const int is = tid/16; // 0 or 1
+const uint8_t h = x[i].qh[in] >> im;
+const float d = x[i].d;
+float * y = yy + i*QK_K + tid;
+y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16));
+y[32] = d * x[i].scales[is+2] * ((q >> 4) - ((h >> 4) & 1 ? 0 : 16));
+#endif
}
static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
const block_q6_K * x = (const block_q6_K *) vx;
const int i = blockIdx.x;
+#if QK_K == 256
// assume 64 threads - this is very slightly better than the one below
const int tid = threadIdx.x;
@@ -501,6 +582,24 @@ static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
y[64] = d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32);
y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
+#else
+// assume 32 threads
+const int tid = threadIdx.x;
+const int ip = tid/16; // 0 or 1
+const int il = tid - 16*ip; // 0...15
+float * y = yy + i*QK_K + 16*ip + il;
+const float d = x[i].d;
+const uint8_t ql = x[i].ql[16*ip + il];
+const uint8_t qh = x[i].qh[il] >> (2*ip);
+const int8_t * sc = x[i].scales;
+y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
+y[32] = d * sc[ip+2] * ((int8_t)((ql >> 4) | (((qh >> 4) & 3) << 4)) - 32);
+#endif
}
static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
@@ -515,6 +614,9 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float
const block_q2_K * x = (const block_q2_K *)vx + ib0;
+float tmp = 0; // partial sum for thread in warp
+#if QK_K == 256
const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...15
const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
@@ -528,8 +630,6 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float
const int s_offset = 8*im;
const int y_offset = 128*im + l0;
-float tmp = 0; // partial sum for thread in warp
uint32_t aux[4];
const uint8_t * d = (const uint8_t *)aux;
const uint8_t * m = (const uint8_t *)(aux + 2);
@@ -565,6 +665,39 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float
tmp += dall * sum1 - dmin * sum2;
}
+#else
+const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15 or 0...7
+const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); // 0....1 or 0...3
+const int offset = tid * K_QUANTS_PER_ITERATION;
+uint32_t uaux[2];
+const uint8_t * d = (const uint8_t *)uaux;
+for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
+const float * y = yy + i * QK_K + offset;
+const uint8_t * q = x[i].qs + offset;
+const uint32_t * s = (const uint32_t *)x[i].scales;
+uaux[0] = s[0] & 0x0f0f0f0f;
+uaux[1] = (s[0] >> 4) & 0x0f0f0f0f;
+const half2 * dh = (const half2 *)&x[i].d;
+const float2 dall = __half22float2(dh[0]);
+float sum1 = 0, sum2 = 0;
+for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
+const uint8_t ql = q[l];
+sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3)
++ y[l+16] * d[1] * ((ql >> 2) & 3)
++ y[l+32] * d[2] * ((ql >> 4) & 3)
++ y[l+48] * d[3] * ((ql >> 6) & 3);
+sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7];
+}
+tmp += dall.x * sum1 - dall.y * sum2;
+}
+#endif
// sum up partial sums and write back result
__syncthreads();
@@ -573,16 +706,13 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
}
-if (tid == 0) {
+if (threadIdx.x == 0) {
dst[row] = tmp;
}
}
static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
-const uint16_t kmask1 = 0x0303;
-const uint16_t kmask2 = 0x0f0f;
const int row = blockIdx.y*blockDim.y + threadIdx.y;
if (row > nrows) return;
@@ -591,6 +721,13 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float
const block_q3_K * x = (const block_q3_K *)vx + ib0;
+float tmp = 0; // partial sum for thread in warp
+#if QK_K == 256
+const uint16_t kmask1 = 0x0303;
+const uint16_t kmask2 = 0x0f0f;
const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
@@ -610,8 +747,6 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float
const uint16_t s_shift = 4*im;
-float tmp = 0; // partial sum for thread in warp
for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
const float * y = yy + i * QK_K + y_offset;
@@ -640,6 +775,34 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float
tmp += d * sum;
}
+#else
+const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15 or 0...7
+const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); // 0....1 or 0...3
+const int offset = tid * K_QUANTS_PER_ITERATION; // 0...15 or 0...14
+const int in = offset/8; // 0 or 1
+const int im = offset%8; // 0...7
+for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
+const float * y = yy + i * QK_K + offset;
+const uint8_t * q = x[i].qs + offset;
+const uint8_t * s = x[i].scales;
+const float dall = (float)x[i].d;
+float sum = 0;
+for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
+const uint8_t hl = x[i].hmask[im+l] >> in;
+const uint8_t ql = q[l];
+sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4))
++ y[l+16] * dall * ((s[0] >> 4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4))
++ y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4))
++ y[l+48] * dall * ((s[1] >> 4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4));
+}
+tmp += sum;
+}
+#endif
// sum up partial sums and write back result
__syncthreads();
@@ -648,22 +811,25 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
}
-if (tid == 0) {
+if (threadIdx.x == 0) {
dst[row] = tmp;
}
}
static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
-const uint16_t kmask1 = 0x3f3f;
-const uint16_t kmask2 = 0x0f0f;
-const uint16_t kmask3 = 0xc0c0;
const int row = blockIdx.y*blockDim.y + threadIdx.y;
if (row > nrows) return;
const int num_blocks_per_row = ncols / QK_K;
const int ib0 = row*num_blocks_per_row;
+const block_q4_K * x = (const block_q4_K *)vx + ib0;
+#if QK_K == 256
+const uint16_t kmask1 = 0x3f3f;
+const uint16_t kmask2 = 0x0f0f;
+const uint16_t kmask3 = 0xc0c0;
const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
@@ -683,8 +849,6 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float
uint16_t aux[4];
const uint8_t * sc = (const uint8_t *)aux;
-const block_q4_K * x = (const block_q4_K *)vx + ib0;
float tmp = 0; // partial sum for thread in warp
for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
@@ -713,6 +877,36 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float
tmp += dall * (s.x * sc[0] + s.y * sc[1] + s.z * sc[4] + s.w * sc[5]) - dmin * smin;
}
+#else
+const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15
+const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION);
+const int step = tid * K_QUANTS_PER_ITERATION;
+uint16_t aux16[2];
+const uint8_t * s = (const uint8_t *)aux16;
+float tmp = 0;
+for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
+const uint8_t * q = x[i].qs + step;
+const float * y = yy + i*QK_K + step;
+const uint16_t * a = (const uint16_t *)x[i].scales;
+aux16[0] = a[0] & 0x0f0f;
+aux16[1] = (a[0] >> 4) & 0x0f0f;
+const float d = (float)x[i].d[0];
+const float m = (float)x[i].d[1];
+float sum = 0.f;
+for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
+sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
++ y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2])
++ y[j+32] * (d * s[1] * (q[j+ 0] >> 4) - m * s[3])
++ y[j+48] * (d * s[1] * (q[j+16] >> 4) - m * s[3]);
+}
+tmp += sum;
+}
+#endif
// sum up partial sums and write back result
__syncthreads();
@@ -728,15 +922,19 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float
static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float * yy, float * dst, const int ncols) {
-const uint16_t kmask1 = 0x3f3f;
-const uint16_t kmask2 = 0x0f0f;
-const uint16_t kmask3 = 0xc0c0;
-//const int row = blockIdx.x*blockDim.y + threadIdx.y;
const int row = blockIdx.x;
const int num_blocks_per_row = ncols / QK_K;
const int ib0 = row*num_blocks_per_row;
+const block_q5_K * x = (const block_q5_K *)vx + ib0;
+float tmp = 0; // partial sum for thread in warp
+#if QK_K == 256
+const uint16_t kmask1 = 0x3f3f;
+const uint16_t kmask2 = 0x0f0f;
+const uint16_t kmask3 = 0xc0c0;
const int tid = threadIdx.x/2; // 0...15
const int ix = threadIdx.x%2;
@@ -757,10 +955,6 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float
uint16_t aux[4];
const uint8_t * sc = (const uint8_t *)aux;
-const block_q5_K * x = (const block_q5_K *)vx + ib0;
-float tmp = 0; // partial sum for thread in warp
for (int i = ix; i < num_blocks_per_row; i += 2) {
const uint8_t * ql1 = x[i].qs + q_offset;
@@ -793,9 +987,32 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float
+ (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
}
tmp += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin;
}
+#else
+const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15
+const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION);
+const int step = tid * K_QUANTS_PER_ITERATION;
+const int im = step/8;
+const int in = step%8;
+for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
+const uint8_t * q = x[i].qs + step;
+const int8_t * s = x[i].scales;
+const float * y = yy + i*QK_K + step;
+const float d = x[i].d;
+float sum = 0.f;
+for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
+const uint8_t h = x[i].qh[in+j] >> im;
+sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16))
++ y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16))
++ y[j+32] * d * s[2] * ((q[j+ 0] >> 4) - ((h >> 4) & 1 ? 0 : 16))
++ y[j+48] * d * s[3] * ((q[j+16] >> 4) - ((h >> 6) & 1 ? 0 : 16));
+}
+tmp += sum;
+}
+#endif
// sum up partial sums and write back result
__syncthreads();
#pragma unroll
@@ -803,7 +1020,7 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
}
-if (tid == 0) {
+if (threadIdx.x == 0) {
dst[row] = tmp;
}
}
@@ -820,6 +1037,8 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float
const block_q6_K * x = (const block_q6_K *)vx + ib0;
+#if QK_K == 256
const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0, 1
@@ -874,6 +1093,37 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float
}
+#else
+const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...7
+const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); // 0...3
+const int step = tid * K_QUANTS_PER_ITERATION;
+float tmp = 0; // partial sum for thread in warp
+for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
+const float * y = yy + i * QK_K + step;
+const uint8_t * ql = x[i].ql + step;
+const uint8_t * qh = x[i].qh + step;
+const int8_t * s = x[i].scales;
+const float d = x[i+0].d;
+float sum = 0;
+for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
+sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32)
++ y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32)
++ y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >> 4) | ((qh[j] & 0x30) >> 0)) - 32)
++ y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >> 4) | ((qh[j] & 0xc0) >> 2)) - 32);
+}
+tmp += sum;
+}
+#endif
// sum up partial sums and write back result
__syncthreads();
#pragma unroll
@@ -1252,12 +1502,20 @@ static void dequantize_row_q8_0_cuda(const void * vx, float * y, const int k, cu
static void dequantize_row_q2_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
const int nb = k / QK_K;
+#if QK_K == 256
dequantize_block_q2_K<<<nb, 64, 0, stream>>>(vx, y);
+#else
+dequantize_block_q2_K<<<nb, 32, 0, stream>>>(vx, y);
+#endif
}
static void dequantize_row_q3_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
const int nb = k / QK_K;
+#if QK_K == 256
dequantize_block_q3_K<<<nb, 64, 0, stream>>>(vx, y);
+#else
+dequantize_block_q3_K<<<nb, 32, 0, stream>>>(vx, y);
+#endif
}
static void dequantize_row_q4_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
@@ -1267,12 +1525,20 @@ static void dequantize_row_q4_K_cuda(const void * vx, float * y, const int k, cu
static void dequantize_row_q5_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
const int nb = k / QK_K;
+#if QK_K == 256
dequantize_block_q5_K<<<nb, 64, 0, stream>>>(vx, y);
+#else
+dequantize_block_q5_K<<<nb, 32, 0, stream>>>(vx, y);
+#endif
}
static void dequantize_row_q6_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
const int nb = k / QK_K;
+#if QK_K == 256
dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
+#else
+dequantize_block_q6_K<<<nb, 32, 0, stream>>>(vx, y);
+#endif
}
static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
@@ -2553,6 +2819,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {
tensor->backend = GGML_BACKEND_GPU;
struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
+memset(extra, 0, sizeof(*extra));
const bool inplace = (tensor->src0 != nullptr && tensor->src0->data == tensor->data) ||
tensor->op == GGML_OP_VIEW;


@@ -51,21 +51,21 @@ struct ggml_metal_context {
GGML_METAL_DECL_KERNEL(get_rows_f16);
GGML_METAL_DECL_KERNEL(get_rows_q4_0);
GGML_METAL_DECL_KERNEL(get_rows_q4_1);
-GGML_METAL_DECL_KERNEL(get_rows_q2_k);
-GGML_METAL_DECL_KERNEL(get_rows_q3_k);
-GGML_METAL_DECL_KERNEL(get_rows_q4_k);
-GGML_METAL_DECL_KERNEL(get_rows_q5_k);
-GGML_METAL_DECL_KERNEL(get_rows_q6_k);
+GGML_METAL_DECL_KERNEL(get_rows_q2_K);
+GGML_METAL_DECL_KERNEL(get_rows_q3_K);
+GGML_METAL_DECL_KERNEL(get_rows_q4_K);
+GGML_METAL_DECL_KERNEL(get_rows_q5_K);
+GGML_METAL_DECL_KERNEL(get_rows_q6_K);
GGML_METAL_DECL_KERNEL(rms_norm);
GGML_METAL_DECL_KERNEL(norm);
GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32);
-GGML_METAL_DECL_KERNEL(mul_mat_q2_k_f32);
-GGML_METAL_DECL_KERNEL(mul_mat_q3_k_f32);
-GGML_METAL_DECL_KERNEL(mul_mat_q4_k_f32);
-GGML_METAL_DECL_KERNEL(mul_mat_q5_k_f32);
-GGML_METAL_DECL_KERNEL(mul_mat_q6_k_f32);
+GGML_METAL_DECL_KERNEL(mul_mat_q2_K_f32);
+GGML_METAL_DECL_KERNEL(mul_mat_q3_K_f32);
+GGML_METAL_DECL_KERNEL(mul_mat_q4_K_f32);
+GGML_METAL_DECL_KERNEL(mul_mat_q5_K_f32);
+GGML_METAL_DECL_KERNEL(mul_mat_q6_K_f32);
GGML_METAL_DECL_KERNEL(rope);
GGML_METAL_DECL_KERNEL(alibi_f32);
GGML_METAL_DECL_KERNEL(cpy_f32_f16);
@@ -132,7 +132,13 @@ struct ggml_metal_context * ggml_metal_init(void) {
exit(1);
}
+#ifdef GGML_QKK_64
+MTLCompileOptions* options = [MTLCompileOptions new];
+options.preprocessorMacros = @{ @"QK_K" : @(64) };
+ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error];
+#else
ctx->library = [ctx->device newLibraryWithSource:src options:nil error:&error];
+#endif
if (error) {
fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
exit(1);
@@ -159,21 +165,21 @@ struct ggml_metal_context * ggml_metal_init(void) {
GGML_METAL_ADD_KERNEL(get_rows_f16);
GGML_METAL_ADD_KERNEL(get_rows_q4_0);
GGML_METAL_ADD_KERNEL(get_rows_q4_1);
-GGML_METAL_ADD_KERNEL(get_rows_q2_k);
-GGML_METAL_ADD_KERNEL(get_rows_q3_k);
-GGML_METAL_ADD_KERNEL(get_rows_q4_k);
-GGML_METAL_ADD_KERNEL(get_rows_q5_k);
-GGML_METAL_ADD_KERNEL(get_rows_q6_k);
+GGML_METAL_ADD_KERNEL(get_rows_q2_K);
+GGML_METAL_ADD_KERNEL(get_rows_q3_K);
+GGML_METAL_ADD_KERNEL(get_rows_q4_K);
+GGML_METAL_ADD_KERNEL(get_rows_q5_K);
+GGML_METAL_ADD_KERNEL(get_rows_q6_K);
GGML_METAL_ADD_KERNEL(rms_norm);
GGML_METAL_ADD_KERNEL(norm);
GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32);
-GGML_METAL_ADD_KERNEL(mul_mat_q2_k_f32);
-GGML_METAL_ADD_KERNEL(mul_mat_q3_k_f32);
-GGML_METAL_ADD_KERNEL(mul_mat_q4_k_f32);
-GGML_METAL_ADD_KERNEL(mul_mat_q5_k_f32);
-GGML_METAL_ADD_KERNEL(mul_mat_q6_k_f32);
+GGML_METAL_ADD_KERNEL(mul_mat_q2_K_f32);
+GGML_METAL_ADD_KERNEL(mul_mat_q3_K_f32);
+GGML_METAL_ADD_KERNEL(mul_mat_q4_K_f32);
+GGML_METAL_ADD_KERNEL(mul_mat_q5_K_f32);
+GGML_METAL_ADD_KERNEL(mul_mat_q6_K_f32);
GGML_METAL_ADD_KERNEL(rope);
GGML_METAL_ADD_KERNEL(alibi_f32);
GGML_METAL_ADD_KERNEL(cpy_f32_f16);
@@ -662,7 +668,7 @@ void ggml_metal_graph_compute(
nth0 = 4;
nth1 = 16;
-[encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_k_f32];
+[encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_K_f32];
} break;
case GGML_TYPE_Q3_K:
{
@@ -671,7 +677,7 @@ void ggml_metal_graph_compute(
nth0 = 4;
nth1 = 16;
-[encoder setComputePipelineState:ctx->pipeline_mul_mat_q3_k_f32];
+[encoder setComputePipelineState:ctx->pipeline_mul_mat_q3_K_f32];
} break;
case GGML_TYPE_Q4_K:
{
@@ -680,7 +686,7 @@ void ggml_metal_graph_compute(
nth0 = 4;
nth1 = 16;
-[encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_k_f32];
+[encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32];
} break;
case GGML_TYPE_Q5_K:
{
@@ -689,7 +695,7 @@ void ggml_metal_graph_compute(
nth0 = 4;
nth1 = 16;
-[encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_k_f32];
+[encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_K_f32];
} break;
case GGML_TYPE_Q6_K:
{
@@ -698,7 +704,7 @@ void ggml_metal_graph_compute(
nth0 = 4;
nth1 = 16;
-[encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_k_f32];
+[encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_K_f32];
} break;
default:
{
@@ -750,11 +756,11 @@ void ggml_metal_graph_compute(
case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_get_rows_f16]; break;
case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break;
case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_1]; break;
-case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q2_k]; break;
-case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q3_k]; break;
-case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_k]; break;
-case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_k]; break;
-case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_k]; break;
+case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q2_K]; break;
+case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q3_K]; break;
+case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_K]; break;
+case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_K]; break;
+case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_K]; break;
default: GGML_ASSERT(false && "not implemented");
}


@ -428,7 +428,7 @@ kernel void kernel_mul_mat_q4_0_f32(
} }
threadgroup_barrier(mem_flags::mem_threadgroup); threadgroup_barrier(mem_flags::mem_threadgroup);
if (ith == 0) { if (ith == 0) {
for (uint i = 16; i < nth; i += 16) sum[0] += sum[i]; for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
dst[r1*ne0 + r0] = sum[0]; dst[r1*ne0 + r0] = sum[0];
} }
} }
@ -497,7 +497,7 @@ kernel void kernel_mul_mat_q4_1_f32(
} }
threadgroup_barrier(mem_flags::mem_threadgroup); threadgroup_barrier(mem_flags::mem_threadgroup);
if (ith == 0) { if (ith == 0) {
for (int i = 16; i < nth; i += 16) sum[0] += sum[i]; for (uint i = 16; i < nth; i += 16) sum[0] += sum[i];
dst[r1*ne0 + r0] = sum[0]; dst[r1*ne0 + r0] = sum[0];
} }
} }
@ -775,47 +775,76 @@ kernel void kernel_cpy_f32_f32(
//============================================ k-quants ====================================================== //============================================ k-quants ======================================================
#ifndef QK_K
#define QK_K 256 #define QK_K 256
#else
static_assert(QK_K == 256 || QK_K == 64, "QK_K must be 256 or 64");
#endif
#if QK_K == 256
#define K_SCALE_SIZE 12
#else
#define K_SCALE_SIZE 4
#endif
typedef struct { typedef struct {
uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
uint8_t qs[QK_K/4]; // quants uint8_t qs[QK_K/4]; // quants
half d; // super-block scale for quantized scales half d; // super-block scale for quantized scales
half dmin; // super-block scale for quantized mins half dmin; // super-block scale for quantized mins
} block_q2_k; } block_q2_K;
// 84 bytes / block // 84 bytes / block
typedef struct { typedef struct {
uint8_t hmask[QK_K/8]; // quants - high bit uint8_t hmask[QK_K/8]; // quants - high bit
uint8_t qs[QK_K/4]; // quants - low 2 bits uint8_t qs[QK_K/4]; // quants - low 2 bits
uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits #if QK_K == 64
half d; // super-block scale uint8_t scales[2];
} block_q3_k; #else
// 110 bytes / block uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
#endif
half d; // super-block scale
} block_q3_K;
#if QK_K == 64
typedef struct {
half d[2]; // super-block scales/mins
uint8_t scales[2];
uint8_t qs[QK_K/2]; // 4-bit quants
} block_q4_K;
#else
typedef struct { typedef struct {
half d; // super-block scale for quantized scales half d; // super-block scale for quantized scales
half dmin; // super-block scale for quantized mins half dmin; // super-block scale for quantized mins
uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
uint8_t qs[QK_K/2]; // 4--bit quants uint8_t qs[QK_K/2]; // 4--bit quants
} block_q4_k; } block_q4_K;
// 144 bytes / block #endif
#if QK_K == 64
typedef struct {
half d; // super-block scales/mins
int8_t scales[QK_K/16]; // 8-bit block scales
uint8_t qh[QK_K/8]; // quants, high bit
uint8_t qs[QK_K/2]; // quants, low 4 bits
} block_q5_K;
#else
typedef struct { typedef struct {
half d; // super-block scale for quantized scales half d; // super-block scale for quantized scales
half dmin; // super-block scale for quantized mins half dmin; // super-block scale for quantized mins
uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
uint8_t qh[QK_K/8]; // quants, high bit uint8_t qh[QK_K/8]; // quants, high bit
uint8_t qs[QK_K/2]; // quants, low 4 bits uint8_t qs[QK_K/2]; // quants, low 4 bits
} block_q5_k; } block_q5_K;
// 176 bytes / block // 176 bytes / block
#endif
typedef struct { typedef struct {
uint8_t ql[QK_K/2]; // quants, lower 4 bits uint8_t ql[QK_K/2]; // quants, lower 4 bits
uint8_t qh[QK_K/4]; // quants, upper 2 bits uint8_t qh[QK_K/4]; // quants, upper 2 bits
int8_t scales[QK_K/16]; // scales, quantized with 8 bits int8_t scales[QK_K/16]; // scales, quantized with 8 bits
half d; // super-block scale half d; // super-block scale
} block_q6_k; } block_q6_K;
// 210 bytes / block // 210 bytes / block
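
As a quick cross-check of the byte counts in the comments above, the following stand-alone C sketch (assuming QK_K == 256 and a 2-byte half, which is what the Metal structs use) recomputes bytes per super-block and the resulting bits per weight:

/* Arithmetic check of the "bytes / block" comments above (QK_K == 256, half == 2 bytes). */
#include <stdio.h>

int main(void) {
    const int QK_K = 256;
    const struct { const char * name; int bytes; } blocks[] = {
        { "q2_K", 256/16 + 256/4 + 2 + 2 },       //  84
        { "q3_K", 256/8 + 256/4 + 12 + 2 },       // 110
        { "q4_K", 2 + 2 + 12 + 256/2 },           // 144
        { "q5_K", 2 + 2 + 12 + 256/8 + 256/2 },   // 176
        { "q6_K", 256/2 + 256/4 + 256/16 + 2 },   // 210
    };
    for (int i = 0; i < 5; ++i) {
        printf("%s: %3d bytes/block, %.4f bits/weight\n",
               blocks[i].name, blocks[i].bytes, 8.0 * blocks[i].bytes / QK_K);
    }
    return 0;
}
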
static inline uchar4 get_scale_min_k4(int j, device const uint8_t * q) { static inline uchar4 get_scale_min_k4(int j, device const uint8_t * q) {
@ -836,7 +865,7 @@ static inline uchar4 get_scale_min_k4(int j, device const uint8_t * q) {
//========================================== dequantization ============================= //========================================== dequantization =============================
static void dequantize_row_q2_k(device const block_q2_k * x, device float * y, int k) { static void dequantize_row_q2_K(device const block_q2_K * x, device float * y, int k) {
assert(k % QK_K == 0); assert(k % QK_K == 0);
const int nb = k / QK_K; const int nb = k / QK_K;
@ -847,6 +876,7 @@ static void dequantize_row_q2_k(device const block_q2_k * x, device float * y, i
device const uint8_t * q = x[i].qs; device const uint8_t * q = x[i].qs;
#if QK_K == 256
int is = 0; int is = 0;
float dl, ml; float dl, ml;
for (int n = 0; n < QK_K; n += 128) { for (int n = 0; n < QK_K; n += 128) {
@ -865,14 +895,29 @@ static void dequantize_row_q2_k(device const block_q2_k * x, device float * y, i
} }
q += 32; q += 32;
} }
#else
float dl1 = d * (x[i].scales[0] & 0xF), ml1 = min * (x[i].scales[0] >> 4);
float dl2 = d * (x[i].scales[1] & 0xF), ml2 = min * (x[i].scales[1] >> 4);
float dl3 = d * (x[i].scales[2] & 0xF), ml3 = min * (x[i].scales[2] >> 4);
float dl4 = d * (x[i].scales[3] & 0xF), ml4 = min * (x[i].scales[3] >> 4);
for (int l = 0; l < 16; ++l) {
y[l+ 0] = dl1 * ((q[l] >> 0) & 3) - ml1;
y[l+16] = dl2 * ((q[l] >> 2) & 3) - ml2;
y[l+32] = dl3 * ((q[l] >> 4) & 3) - ml3;
y[l+48] = dl4 * ((q[l] >> 6) & 3) - ml4;
}
y += QK_K;
#endif
} }
} }
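
The decode rule above can be exercised in isolation. The snippet below is a hypothetical stand-alone example, collapsed to a single 4-bit scale/min pair as in the QK_K == 64 branch; all values are made up:

/* One packed byte of q holds four 2-bit quants; each reconstructs as
 * y = d*scale*((q >> shift) & 3) - dmin*min, with scale/min taken from one nibble pair. */
#include <stdint.h>
#include <stdio.h>

int main(void) {
    const uint8_t q  = 0xB4;          // packs quants 0, 1, 3, 2 (bit pairs 0-1, 2-3, 4-5, 6-7)
    const float   d  = 0.05f, dmin = 0.02f;
    const uint8_t sc = 0x3A;          // low nibble = scale (10), high nibble = min (3)
    for (int shift = 0; shift < 8; shift += 2) {
        const int   quant = (q >> shift) & 3;
        const float y     = d * (sc & 0xF) * quant - dmin * (sc >> 4);
        printf("shift %d: quant %d -> y = %f\n", shift, quant, y);
    }
    return 0;
}
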
static void dequantize_row_q3_k(device const block_q3_k * x, device float * y, int k) { static void dequantize_row_q3_K(device const block_q3_K * x, device float * y, int k) {
assert(k % QK_K == 0); assert(k % QK_K == 0);
const int nb = k / QK_K; const int nb = k / QK_K;
#if QK_K == 256
const uint16_t kmask1 = 0x0303; const uint16_t kmask1 = 0x0303;
const uint16_t kmask2 = 0x0f0f; const uint16_t kmask2 = 0x0f0f;
@ -918,22 +963,49 @@ static void dequantize_row_q3_k(device const block_q3_k * x, device float * y, i
} }
q += 32; q += 32;
} }
} }
#else
for (int i = 0; i < nb; i++) {
const float d_all = (float)(x[i].d);
device const uint8_t * q = x[i].qs;
device const uint8_t * hm = x[i].hmask;
const float d1 = d_all * ((x[i].scales[0] & 0xF) - 8);
const float d2 = d_all * ((x[i].scales[0] >> 4) - 8);
const float d3 = d_all * ((x[i].scales[1] & 0xF) - 8);
const float d4 = d_all * ((x[i].scales[1] >> 4) - 8);
for (int l = 0; l < 8; ++l) {
uint8_t h = hm[l];
y[l+ 0] = d1 * ((int8_t)((q[l+0] >> 0) & 3) - ((h & 0x01) ? 0 : 4));
y[l+ 8] = d1 * ((int8_t)((q[l+8] >> 0) & 3) - ((h & 0x02) ? 0 : 4));
y[l+16] = d2 * ((int8_t)((q[l+0] >> 2) & 3) - ((h & 0x04) ? 0 : 4));
y[l+24] = d2 * ((int8_t)((q[l+8] >> 2) & 3) - ((h & 0x08) ? 0 : 4));
y[l+32] = d3 * ((int8_t)((q[l+0] >> 4) & 3) - ((h & 0x10) ? 0 : 4));
y[l+40] = d3 * ((int8_t)((q[l+8] >> 4) & 3) - ((h & 0x20) ? 0 : 4));
y[l+48] = d4 * ((int8_t)((q[l+0] >> 6) & 3) - ((h & 0x40) ? 0 : 4));
y[l+56] = d4 * ((int8_t)((q[l+8] >> 6) & 3) - ((h & 0x80) ? 0 : 4));
}
y += QK_K;
}
#endif
} }
static void dequantize_row_q4_k(device const block_q4_k * x, device float * y, int k) { static void dequantize_row_q4_K(device const block_q4_K * x, device float * y, int k) {
assert(k % QK_K == 0); assert(k % QK_K == 0);
const int nb = k / QK_K; const int nb = k / QK_K;
for (int i = 0; i < nb; i++) { for (int i = 0; i < nb; i++) {
device const uint8_t * q = x[i].qs;
#if QK_K == 256
const float d = x[i].d; const float d = x[i].d;
const float min = x[i].dmin; const float min = x[i].dmin;
device const uint8_t * q = x[i].qs;
device const uint8_t * scales = x[i].scales; device const uint8_t * scales = x[i].scales;
int is = 0; int is = 0;
@ -945,14 +1017,29 @@ static void dequantize_row_q4_k(device const block_q4_k * x, device float * y, i
for (int l = 0; l < 32; ++l) *y++ = d2 * (q[l] >> 4) - m2; for (int l = 0; l < 32; ++l) *y++ = d2 * (q[l] >> 4) - m2;
q += 32; is += 2; q += 32; is += 2;
} }
#else
device const uint8_t * s = x[i].scales;
device const half2 * dh = (device const half2 *)x[i].d;
const float2 d = (float2)dh[0];
const float d1 = d[0] * (s[0] & 0xF);
const float d2 = d[0] * (s[1] & 0xF);
const float m1 = d[1] * (s[0] >> 4);
const float m2 = d[1] * (s[1] >> 4);
for (int l = 0; l < 32; ++l) {
y[l+ 0] = d1 * (q[l] & 0xF) - m1;
y[l+32] = d2 * (q[l] >> 4) - m2;
}
y += QK_K;
#endif
} }
} }
static void dequantize_row_q5_k(device const block_q5_k * x, device float * y, int k) { static void dequantize_row_q5_K(device const block_q5_K * x, device float * y, int k) {
assert(k % QK_K == 0); assert(k % QK_K == 0);
const int nb = k / QK_K; const int nb = k / QK_K;
#if QK_K == 256
for (int i = 0; i < nb; i++) { for (int i = 0; i < nb; i++) {
const float d = (float)(x[i].d); const float d = (float)(x[i].d);
@ -973,10 +1060,32 @@ static void dequantize_row_q5_k(device const block_q5_k * x, device float * y, i
u1 <<= 2; u2 <<= 2; u1 <<= 2; u2 <<= 2;
} }
} }
#else
for (int i = 0; i < nb; i++) {
const float d = (float)x[i].d;
device const uint8_t * ql = x[i].qs;
device const uint8_t * qh = x[i].qh;
device const int8_t * sc = x[i].scales;
for (int l = 0; l < 8; ++l) {
y[l+ 0] = d * sc[0] * ((ql[l+ 0] & 0xF) - (qh[l] & 0x01 ? 0 : 16));
y[l+ 8] = d * sc[0] * ((ql[l+ 8] & 0xF) - (qh[l] & 0x02 ? 0 : 16));
y[l+16] = d * sc[1] * ((ql[l+16] & 0xF) - (qh[l] & 0x04 ? 0 : 16));
y[l+24] = d * sc[1] * ((ql[l+24] & 0xF) - (qh[l] & 0x08 ? 0 : 16));
y[l+32] = d * sc[2] * ((ql[l+ 0] >> 4) - (qh[l] & 0x10 ? 0 : 16));
y[l+40] = d * sc[2] * ((ql[l+ 8] >> 4) - (qh[l] & 0x20 ? 0 : 16));
y[l+48] = d * sc[3] * ((ql[l+16] >> 4) - (qh[l] & 0x40 ? 0 : 16));
y[l+56] = d * sc[3] * ((ql[l+24] >> 4) - (qh[l] & 0x80 ? 0 : 16));
}
y += QK_K;
}
#endif
} }
static void dequantize_row_q6_k(device const block_q6_k * x, device float * y, int k) { static void dequantize_row_q6_K(device const block_q6_K * x, device float * y, int k) {
assert(k % QK_K == 0); assert(k % QK_K == 0);
const int nb = k / QK_K; const int nb = k / QK_K;
@ -988,6 +1097,7 @@ static void dequantize_row_q6_k(device const block_q6_k * x, device float * y, i
const float d = x[i].d; const float d = x[i].d;
#if QK_K == 256
for (int n = 0; n < QK_K; n += 128) { for (int n = 0; n < QK_K; n += 128) {
for (int l = 0; l < 32; ++l) { for (int l = 0; l < 32; ++l) {
int is = l/16; int is = l/16;
@ -1005,10 +1115,23 @@ static void dequantize_row_q6_k(device const block_q6_k * x, device float * y, i
qh += 32; qh += 32;
sc += 8; sc += 8;
} }
#else
for (int l = 0; l < 16; ++l) {
const int8_t q1 = (int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
const int8_t q2 = (int8_t)((ql[l+16] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
const int8_t q3 = (int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
const int8_t q4 = (int8_t)((ql[l+16] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
y[l+ 0] = d * sc[0] * q1;
y[l+16] = d * sc[1] * q2;
y[l+32] = d * sc[2] * q3;
y[l+48] = d * sc[3] * q4;
}
y += 64;
#endif
} }
} }
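
The kernel_get_rows_* kernels that follow all share one shape: look up a row index, then dequantize that single row into the destination. A plain-C sketch of the pattern (with a stand-in float "dequantizer" instead of the real dequantize_row_q*_K functions) looks like this:

#include <stdint.h>
#include <stdio.h>

typedef void (*dequant_row_fn)(const void * x, float * y, int k);

/* stand-in dequantizer: treats a row as raw floats; the real kernels plug in
 * dequantize_row_q2_K .. dequantize_row_q6_K here */
static void copy_row_f32(const void * x, float * y, int k) {
    const float * src = (const float *) x;
    for (int i = 0; i < k; ++i) y[i] = src[i];
}

static void get_rows(const void * src0, size_t row_stride_bytes,
                     const int32_t * rows, int n_rows, int ne00,
                     float * dst, dequant_row_fn dequantize_row) {
    for (int i = 0; i < n_rows; ++i) {
        const char * row = (const char *) src0 + (size_t) rows[i] * row_stride_bytes;
        dequantize_row(row, dst + (size_t) i * ne00, ne00);
    }
}

int main(void) {
    float   src0[3][4] = { {0,1,2,3}, {4,5,6,7}, {8,9,10,11} };
    int32_t rows[2]    = { 2, 0 };    // gather row 2, then row 0
    float   dst[2][4];
    get_rows(src0, sizeof(src0[0]), rows, 2, 4, &dst[0][0], copy_row_f32);
    printf("dst[0][0] = %g, dst[1][3] = %g\n", dst[0][0], dst[1][3]);   // 8 and 3
    return 0;
}
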
kernel void kernel_get_rows_q2_k( kernel void kernel_get_rows_q2_K(
device const void * src0, device const void * src0,
device const int * src1, device const int * src1,
device float * dst, device float * dst,
@ -1019,12 +1142,12 @@ kernel void kernel_get_rows_q2_k(
const int i = tpig; const int i = tpig;
const int r = ((device int32_t *) src1)[i]; const int r = ((device int32_t *) src1)[i];
dequantize_row_q2_k( dequantize_row_q2_K(
(device const block_q2_k *) ((device char *) src0 + r*nb01), (device const block_q2_K *) ((device char *) src0 + r*nb01),
(device float *) ((device char *) dst + i*nb1), ne00); (device float *) ((device char *) dst + i*nb1), ne00);
} }
kernel void kernel_get_rows_q3_k( kernel void kernel_get_rows_q3_K(
device const void * src0, device const void * src0,
device const int * src1, device const int * src1,
device float * dst, device float * dst,
@ -1035,12 +1158,12 @@ kernel void kernel_get_rows_q3_k(
const int i = tpig; const int i = tpig;
const int r = ((device int32_t *) src1)[i]; const int r = ((device int32_t *) src1)[i];
dequantize_row_q3_k( dequantize_row_q3_K(
(device const block_q3_k *) ((device char *) src0 + r*nb01), (device const block_q3_K *) ((device char *) src0 + r*nb01),
(device float *) ((device char *) dst + i*nb1), ne00); (device float *) ((device char *) dst + i*nb1), ne00);
} }
kernel void kernel_get_rows_q4_k( kernel void kernel_get_rows_q4_K(
device const void * src0, device const void * src0,
device const int * src1, device const int * src1,
device float * dst, device float * dst,
@ -1051,12 +1174,12 @@ kernel void kernel_get_rows_q4_k(
const int i = tpig; const int i = tpig;
const int r = ((device int32_t *) src1)[i]; const int r = ((device int32_t *) src1)[i];
dequantize_row_q4_k( dequantize_row_q4_K(
(device const block_q4_k *) ((device char *) src0 + r*nb01), (device const block_q4_K *) ((device char *) src0 + r*nb01),
(device float *) ((device char *) dst + i*nb1), ne00); (device float *) ((device char *) dst + i*nb1), ne00);
} }
kernel void kernel_get_rows_q5_k( kernel void kernel_get_rows_q5_K(
device const void * src0, device const void * src0,
device const int * src1, device const int * src1,
device float * dst, device float * dst,
@ -1067,12 +1190,12 @@ kernel void kernel_get_rows_q5_k(
const int i = tpig; const int i = tpig;
const int r = ((device int32_t *) src1)[i]; const int r = ((device int32_t *) src1)[i];
dequantize_row_q5_k( dequantize_row_q5_K(
(device const block_q5_k *) ((device char *) src0 + r*nb01), (device const block_q5_K *) ((device char *) src0 + r*nb01),
(device float *) ((device char *) dst + i*nb1), ne00); (device float *) ((device char *) dst + i*nb1), ne00);
} }
kernel void kernel_get_rows_q6_k( kernel void kernel_get_rows_q6_K(
device const void * src0, device const void * src0,
device const int * src1, device const int * src1,
device float * dst, device float * dst,
@ -1083,14 +1206,14 @@ kernel void kernel_get_rows_q6_k(
const int i = tpig; const int i = tpig;
const int r = ((device int32_t *) src1)[i]; const int r = ((device int32_t *) src1)[i];
dequantize_row_q6_k( dequantize_row_q6_K(
(device const block_q6_k *) ((device char *) src0 + r*nb01), (device const block_q6_K *) ((device char *) src0 + r*nb01),
(device float *) ((device char *) dst + i*nb1), ne00); (device float *) ((device char *) dst + i*nb1), ne00);
} }
//====================================== dot products ========================= //====================================== dot products =========================
kernel void kernel_mul_mat_q2_k_f32( kernel void kernel_mul_mat_q2_K_f32(
device const void * src0, device const void * src0,
device const float * src1, device const float * src1,
device float * dst, device float * dst,
@ -1107,12 +1230,15 @@ kernel void kernel_mul_mat_q2_k_f32(
const int64_t r0 = tgpig.x; const int64_t r0 = tgpig.x;
const int64_t r1 = tgpig.y; const int64_t r1 = tgpig.y;
device const block_q2_k * x = (device const block_q2_k *) src0 + r0*nb; device const block_q2_K * x = (device const block_q2_K *) src0 + r0*nb;
device const float * yy = (device const float *) src1 + r1*ne10; device const float * yy = (device const float *) src1 + r1*ne10;
const int nth = tptg.x*tptg.y; const int nth = tptg.x*tptg.y;
const int ith = tptg.y*tpitg.x + tpitg.y; const int ith = tptg.y*tpitg.x + tpitg.y;
float sumf = 0;
#if QK_K == 256
const int tid = tpitg.y; // 0...16 const int tid = tpitg.y; // 0...16
const int il = tid/4; // 0...3 const int il = tid/4; // 0...3
const int ir = tid%4; // 0...3 const int ir = tid%4; // 0...3
@ -1125,9 +1251,6 @@ kernel void kernel_mul_mat_q2_k_f32(
const int y_offset = 64*il + n*ir; const int y_offset = 64*il + n*ir;
const int q_offset = 32*ip + n*ir; const int q_offset = 32*ip + n*ir;
sum[ith] = 0.0f;
float sumf = 0;
for (int i = tpitg.x; i < nb; i += tptg.x) { for (int i = tpitg.x; i < nb; i += tptg.x) {
device const uint8_t * q = x[i].qs + q_offset; device const uint8_t * q = x[i].qs + q_offset;
@ -1140,7 +1263,6 @@ kernel void kernel_mul_mat_q2_k_f32(
device const float * y = yy + i*QK_K + y_offset; device const float * y = yy + i*QK_K + y_offset;
//float4 s = {0.f, 0.f, 0.f, 0.f};
float2 s = {0.f, 0.f}; float2 s = {0.f, 0.f};
float smin = 0; float smin = 0;
for (int l = 0; l < n; ++l) { for (int l = 0; l < n; ++l) {
@ -1155,25 +1277,38 @@ kernel void kernel_mul_mat_q2_k_f32(
sumf += dall * (s[0] * d1 + s[1] * d2) - dmin * smin; sumf += dall * (s[0] * d1 + s[1] * d2) - dmin * smin;
} }
#else
const int il = 4 * tpitg.x;
uint32_t aux[2];
thread const uint8_t * d = (thread const uint8_t *)aux;
thread const uint8_t * m = (thread const uint8_t *)aux + 4;
for (int i = tpitg.y; i < nb; i += tptg.y) {
device const uint8_t * q = x[i].qs + il;
device const float * y = yy + i*QK_K + il;
const float dall = (float)x[i].d;
const float dmin = (float)x[i].dmin;
device const uint32_t * a = (device const uint32_t *)x[i].scales;
aux[0] = a[0] & 0x0f0f0f0f;
aux[1] = (a[0] >> 4) & 0x0f0f0f0f;
for (int l = 0; l < 4; ++l) {
sumf += y[l+ 0] * (dall * d[0] * ((q[l] >> 0) & 3) - dmin * m[0])
+ y[l+16] * (dall * d[1] * ((q[l] >> 2) & 3) - dmin * m[1])
+ y[l+32] * (dall * d[2] * ((q[l] >> 4) & 3) - dmin * m[2])
+ y[l+48] * (dall * d[3] * ((q[l] >> 6) & 3) - dmin * m[3]);
}
}
#endif
sum[ith] = sumf; sum[ith] = sumf;
//int mask1 = (ith%4 == 0);
//int mask2 = (ith%16 == 0);
//threadgroup_barrier(mem_flags::mem_threadgroup);
//for (int i = 1; i < 4; ++i) sum[ith] += mask1 * sum[ith + i];
//threadgroup_barrier(mem_flags::mem_threadgroup);
//for (int i = 4; i < 16; i += 4) sum[ith] += mask2 * sum[ith + i];
//threadgroup_barrier(mem_flags::mem_threadgroup);
//if (ith == 0) {
// for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
// dst[r1*ne0 + r0] = sum[0];
//}
// //
// Accumulate the sum from all threads in the threadgroup // Accumulate the sum from all threads in the threadgroup
// This version is slightly faster than the commented out one below,
// which I copy-pasted from ggerganov's q4_0 dot product for metal.
// //
threadgroup_barrier(mem_flags::mem_threadgroup); threadgroup_barrier(mem_flags::mem_threadgroup);
if (ith%4 == 0) { if (ith%4 == 0) {
@ -1190,7 +1325,7 @@ kernel void kernel_mul_mat_q2_k_f32(
} }
} }
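
Before moving on to q3_K, here is a serial sketch of the staged partial-sum reduction these kernels rely on, assuming the group-of-4 / group-of-16 / stride-16 folding visible above and nth being a multiple of 16; the threadgroup barriers of the Metal version are irrelevant in single-threaded C:

#include <stdio.h>

int main(void) {
    enum { nth = 32 };
    float sum[nth];
    for (int ith = 0; ith < nth; ++ith) sum[ith] = 1.0f;   // pretend per-thread partial sums

    for (int ith = 0; ith < nth; ith += 4)                 // stage 1: fold groups of 4
        for (int i = 1; i < 4; ++i) sum[ith] += sum[ith + i];
    for (int ith = 0; ith < nth; ith += 16)                // stage 2: fold groups of 16
        for (int i = 4; i < 16; i += 4) sum[ith] += sum[ith + i];
    for (int i = 16; i < nth; i += 16) sum[0] += sum[i];   // stage 3: "thread 0" finishes

    printf("total = %f (expected %d)\n", sum[0], nth);
    return 0;
}
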
kernel void kernel_mul_mat_q3_k_f32( kernel void kernel_mul_mat_q3_K_f32(
device const void * src0, device const void * src0,
device const float * src1, device const float * src1,
device float * dst, device float * dst,
@ -1203,23 +1338,25 @@ kernel void kernel_mul_mat_q3_k_f32(
uint2 tpitg[[thread_position_in_threadgroup]], uint2 tpitg[[thread_position_in_threadgroup]],
uint2 tptg[[threads_per_threadgroup]]) { uint2 tptg[[threads_per_threadgroup]]) {
const uint16_t kmask1 = 0x0303;
const uint16_t kmask2 = 0x0f0f;
const uint8_t m3 = 3;
const int8_t m4 = 4;
const int nb = ne00/QK_K; const int nb = ne00/QK_K;
const int64_t r0 = tgpig.x; const int64_t r0 = tgpig.x;
const int64_t r1 = tgpig.y; const int64_t r1 = tgpig.y;
device const block_q3_k * x = (device const block_q3_k *) src0 + r0*nb; device const block_q3_K * x = (device const block_q3_K *) src0 + r0*nb;
device const float * yy = (device const float *) src1 + r1*ne10; device const float * yy = (device const float *) src1 + r1*ne10;
const int nth = tptg.x*tptg.y; const int nth = tptg.x*tptg.y;
const int ith = tptg.y*tpitg.x + tpitg.y; const int ith = tptg.y*tpitg.x + tpitg.y;
#if QK_K == 256
const uint8_t m3 = 3;
const int8_t m4 = 4;
const uint16_t kmask1 = 0x0303;
const uint16_t kmask2 = 0x0f0f;
const int tid = tpitg.y; // expecting 16 const int tid = tpitg.y; // expecting 16
const int ip = tid/8; // 0 or 1 const int ip = tid/8; // 0 or 1
const int il = tid/2 - 4*ip; // 0...3 const int il = tid/2 - 4*ip; // 0...3
@ -1273,6 +1410,39 @@ kernel void kernel_mul_mat_q3_k_f32(
//sum[ith] = sumf; //sum[ith] = sumf;
sum[ith] = sumf1 - 32.f*sumf2; sum[ith] = sumf1 - 32.f*sumf2;
#else
const int il = 4 * tpitg.x; // 0, 4, 8, 12
const int im = il/8; // 0, 0, 1, 1
const int in = il%8; // 0, 4, 0, 4
float sumf = 0;
for (int i = tpitg.y; i < nb; i += tptg.y) {
const float d_all = (float)(x[i].d);
device const uint8_t * q = x[i].qs + il;
device const uint8_t * h = x[i].hmask + in;
device const float * y = yy + i * QK_K + il;
const float d1 = d_all * ((x[i].scales[0] & 0xF) - 8);
const float d2 = d_all * ((x[i].scales[0] >> 4) - 8);
const float d3 = d_all * ((x[i].scales[1] & 0xF) - 8);
const float d4 = d_all * ((x[i].scales[1] >> 4) - 8);
for (int l = 0; l < 4; ++l) {
const uint8_t hm = h[l] >> im;
sumf += y[l+ 0] * d1 * ((int8_t)((q[l+0] >> 0) & 3) - ((hm & 0x01) ? 0 : 4))
+ y[l+16] * d2 * ((int8_t)((q[l+0] >> 2) & 3) - ((hm & 0x04) ? 0 : 4))
+ y[l+32] * d3 * ((int8_t)((q[l+0] >> 4) & 3) - ((hm & 0x10) ? 0 : 4))
+ y[l+48] * d4 * ((int8_t)((q[l+0] >> 6) & 3) - ((hm & 0x40) ? 0 : 4));
}
}
sum[ith] = sumf;
#endif
// //
// Accumulate the sum from all threads in the threadgroup // Accumulate the sum from all threads in the threadgroup
@ -1293,7 +1463,7 @@ kernel void kernel_mul_mat_q3_k_f32(
} }
kernel void kernel_mul_mat_q4_k_f32( kernel void kernel_mul_mat_q4_K_f32(
device const void * src0, device const void * src0,
device const float * src1, device const float * src1,
device float * dst, device float * dst,
@ -1305,21 +1475,25 @@ kernel void kernel_mul_mat_q4_k_f32(
uint2 tpitg[[thread_position_in_threadgroup]], uint2 tpitg[[thread_position_in_threadgroup]],
uint2 tptg[[threads_per_threadgroup]]) { uint2 tptg[[threads_per_threadgroup]]) {
const uint16_t kmask1 = 0x3f3f;
const uint16_t kmask2 = 0x0f0f;
const uint16_t kmask3 = 0xc0c0;
const int nb = ne00/QK_K; const int nb = ne00/QK_K;
const int64_t r0 = tgpig.x; const int64_t r0 = tgpig.x;
const int64_t r1 = tgpig.y; const int64_t r1 = tgpig.y;
device const block_q4_k * x = (device const block_q4_k *) src0 + r0*nb;
device const float * yy = (device const float *) src1 + r1*ne10;
const int nth = tptg.x*tptg.y; const int nth = tptg.x*tptg.y;
const int ith = tptg.y*tpitg.x + tpitg.y; const int ith = tptg.y*tpitg.x + tpitg.y;
device const block_q4_K * x = (device const block_q4_K *) src0 + r0*nb;
device const float * yy = (device const float *) src1 + r1*ne10;
float sumf = 0;
#if QK_K == 256
const uint16_t kmask1 = 0x3f3f;
const uint16_t kmask2 = 0x0f0f;
const uint16_t kmask3 = 0xc0c0;
const int tid = tpitg.y; // 0...16 const int tid = tpitg.y; // 0...16
const int il = tid/4; // 0...3 const int il = tid/4; // 0...3
const int ir = tid - 4*il;// 0...3 const int ir = tid - 4*il;// 0...3
@ -1332,11 +1506,8 @@ kernel void kernel_mul_mat_q4_k_f32(
const int q_offset = 32*im + l0; const int q_offset = 32*im + l0;
const int y_offset = 64*im + l0; const int y_offset = 64*im + l0;
sum[ith] = 0.0f;
uchar2 sc1, sc2, sc3, sc4; uchar2 sc1, sc2, sc3, sc4;
float sumf = 0;
for (int i = tpitg.x; i < nb; i += tptg.x) { for (int i = tpitg.x; i < nb; i += tptg.x) {
device const uint8_t * q1 = (x + i)->qs + q_offset; device const uint8_t * q1 = (x + i)->qs + q_offset;
@ -1365,6 +1536,30 @@ kernel void kernel_mul_mat_q4_k_f32(
sumf += dall * (s[0] * sc1[0] + s[1] * sc1[1] + s[2] * sc3[0] + s[3] * sc3[1]) - dmin * smin; sumf += dall * (s[0] * sc1[0] + s[1] * sc1[1] + s[2] * sc3[0] + s[3] * sc3[1]) - dmin * smin;
} }
#else
uint16_t aux16[2];
thread const uint8_t * scales = (thread const uint8_t *)aux16;
const int il = 4*tpitg.x;
for (int i = tpitg.y; i < nb; i += tptg.y) {
device const uint8_t * q = x[i].qs + il;
device const float * y = yy + i * QK_K + il;
const float d = (float)x[i].d[0];
const float m = (float)x[i].d[1];
device const uint16_t * a = (device const uint16_t *)x[i].scales;
aux16[0] = a[0] & 0x0f0f;
aux16[1] = (a[0] >> 4) & 0x0f0f;
for (int l = 0; l < 4; ++l) {
sumf += d * scales[0] * (y[l+ 0] * (q[l] & 0xF) + y[l+16] * (q[l+16] & 0xF)) - m * scales[2] * (y[l+ 0] + y[l+16])
+ d * scales[1] * (y[l+32] * (q[l] >> 4) + y[l+48] * (q[l+16] >> 4)) - m * scales[3] * (y[l+32] + y[l+48]);
}
}
#endif
sum[ith] = sumf; sum[ith] = sumf;
@ -1401,7 +1596,7 @@ kernel void kernel_mul_mat_q4_k_f32(
//} //}
} }
kernel void kernel_mul_mat_q5_k_f32( kernel void kernel_mul_mat_q5_K_f32(
device const void * src0, device const void * src0,
device const float * src1, device const float * src1,
device float * dst, device float * dst,
@ -1413,21 +1608,25 @@ kernel void kernel_mul_mat_q5_k_f32(
uint2 tpitg[[thread_position_in_threadgroup]], uint2 tpitg[[thread_position_in_threadgroup]],
uint2 tptg[[threads_per_threadgroup]]) { uint2 tptg[[threads_per_threadgroup]]) {
const uint16_t kmask1 = 0x3f3f;
const uint16_t kmask2 = 0x0f0f;
const uint16_t kmask3 = 0xc0c0;
const int nb = ne00/QK_K; const int nb = ne00/QK_K;
const int64_t r0 = tgpig.x; const int64_t r0 = tgpig.x;
const int64_t r1 = tgpig.y; const int64_t r1 = tgpig.y;
device const block_q5_k * x = (device const block_q5_k *) src0 + r0*nb; device const block_q5_K * x = (device const block_q5_K *) src0 + r0*nb;
device const float * yy = (device const float *) src1 + r1*ne10; device const float * yy = (device const float *) src1 + r1*ne10;
const int nth = tptg.x*tptg.y; const int nth = tptg.x*tptg.y;
const int ith = tptg.y*tpitg.x + tpitg.y; const int ith = tptg.y*tpitg.x + tpitg.y;
float sumf = 0;
#if QK_K == 256
const uint16_t kmask1 = 0x3f3f;
const uint16_t kmask2 = 0x0f0f;
const uint16_t kmask3 = 0xc0c0;
const int tid = tpitg.y; // 0...16 const int tid = tpitg.y; // 0...16
const int il = tid/4; // 0...3 const int il = tid/4; // 0...3
const int ir = tid - 4*il;// 0...3 const int ir = tid - 4*il;// 0...3
@ -1447,7 +1646,6 @@ kernel void kernel_mul_mat_q5_k_f32(
uchar2 sc1, sc2, sc3, sc4; uchar2 sc1, sc2, sc3, sc4;
float sumf = 0;
for (int i = tpitg.x; i < nb; i += tptg.x) { for (int i = tpitg.x; i < nb; i += tptg.x) {
device const uint8_t * q1 = (x + i)->qs + q_offset; device const uint8_t * q1 = (x + i)->qs + q_offset;
@ -1479,6 +1677,28 @@ kernel void kernel_mul_mat_q5_k_f32(
sumf += dall * (s[0] * sc1[0] + s[1] * sc1[1] + s[2] * sc3[0] + s[3] * sc3[1]) - dmin * smin; sumf += dall * (s[0] * sc1[0] + s[1] * sc1[1] + s[2] * sc3[0] + s[3] * sc3[1]) - dmin * smin;
} }
#else
const int il = 4 * tpitg.x; // 0, 4, 8, 12
const int im = il/8; // 0, 0, 1, 1
const int in = il%8; // 0, 4, 0, 4
for (int i = tpitg.y; i < nb; i += tptg.y) {
const float d = (float)x[i].d;
device const uint8_t * q = x[i].qs + il;
device const uint8_t * h = x[i].qh + in;
device const int8_t * s = x[i].scales;
device const float * y = yy + i*QK_K + il;
for (int l = 0; l < 4; ++l) {
const uint8_t hl = h[l] >> im;
sumf += y[l+ 0] * d * s[0] * ((q[l+ 0] & 0xF) - (hl & 0x01 ? 0 : 16))
+ y[l+16] * d * s[1] * ((q[l+16] & 0xF) - (hl & 0x04 ? 0 : 16))
+ y[l+32] * d * s[2] * ((q[l+ 0] >> 4) - (hl & 0x10 ? 0 : 16))
+ y[l+48] * d * s[3] * ((q[l+16] >> 4) - (hl & 0x40 ? 0 : 16));
}
}
#endif
sum[ith] = sumf; sum[ith] = sumf;
// //
@ -1500,7 +1720,7 @@ kernel void kernel_mul_mat_q5_k_f32(
} }
kernel void kernel_mul_mat_q6_k_f32( kernel void kernel_mul_mat_q6_K_f32(
device const void * src0, device const void * src0,
device const float * src1, device const float * src1,
device float * dst, device float * dst,
@ -1522,12 +1742,15 @@ kernel void kernel_mul_mat_q6_k_f32(
const int64_t r0 = tgpig.x; const int64_t r0 = tgpig.x;
const int64_t r1 = tgpig.y; const int64_t r1 = tgpig.y;
device const block_q6_k * x = (device const block_q6_k *) src0 + r0*nb; device const block_q6_K * x = (device const block_q6_K *) src0 + r0*nb;
device const float * yy = (device const float *) src1 + r1*ne10; device const float * yy = (device const float *) src1 + r1*ne10;
const int nth = tptg.x*tptg.y; const int nth = tptg.x*tptg.y;
const int ith = tptg.y*tpitg.x + tpitg.y; const int ith = tptg.y*tpitg.x + tpitg.y;
float sumf = 0;
#if QK_K == 256
// Note: we absolutely assume that tptg.y = 16 and QK_K = 256! // Note: we absolutely assume that tptg.y = 16 and QK_K = 256!
const int iqs = 16 * tpitg.y; const int iqs = 16 * tpitg.y;
const int ip = iqs / 128; // 0 or 1 const int ip = iqs / 128; // 0 or 1
@ -1540,7 +1763,6 @@ kernel void kernel_mul_mat_q6_k_f32(
const int q_offset_l = 64*ip + l0; const int q_offset_l = 64*ip + l0;
const int q_offset_h = 32*ip + l0; const int q_offset_h = 32*ip + l0;
float sumf = 0;
for (int i = tpitg.x; i < nb; i += tptg.x) { for (int i = tpitg.x; i < nb; i += tptg.x) {
device const uint8_t * ql = x[i].ql + q_offset_l; device const uint8_t * ql = x[i].ql + q_offset_l;
@ -1562,6 +1784,28 @@ kernel void kernel_mul_mat_q6_k_f32(
sumf += dall * (sums[0] * sc[0] + sums[1] * sc[2] + sums[2] * sc[4] + sums[3] * sc[6]); sumf += dall * (sums[0] * sc[0] + sums[1] * sc[2] + sums[2] * sc[4] + sums[3] * sc[6]);
} }
#else
const int il = 4*tpitg.x; // 0, 4, 8, 12
for (int i = tpitg.y; i < nb; i += tptg.y) {
device const float * y = yy + i * QK_K + il;
device const uint8_t * ql = x[i].ql + il;
device const uint8_t * qh = x[i].qh + il;
device const int8_t * s = x[i].scales;
const float d = x[i].d;
float4 sums = {0.f, 0.f, 0.f, 0.f};
for (int l = 0; l < 4; ++l) {
sums[0] += y[l+ 0] * ((int8_t)((ql[l+ 0] & 0xF) | ((qh[l] & kmask1) << 4)) - 32);
sums[1] += y[l+16] * ((int8_t)((ql[l+16] & 0xF) | ((qh[l] & kmask2) << 2)) - 32);
sums[2] += y[l+32] * ((int8_t)((ql[l+ 0] >> 4) | ((qh[l] & kmask3) >> 0)) - 32);
sums[3] += y[l+48] * ((int8_t)((ql[l+16] >> 4) | ((qh[l] & kmask4) >> 2)) - 32);
}
sumf += d * (sums[0] * s[0] + sums[1] * s[1] + sums[2] * s[2] + sums[3] * s[3]);
}
#endif
sum[ith] = sumf; sum[ith] = sumf;
ggml.c
View file
@ -91,6 +91,11 @@ static int sched_yield (void) {
#include <stdatomic.h> #include <stdatomic.h>
typedef void* thread_ret_t; typedef void* thread_ret_t;
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#endif #endif
// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512 // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
@ -119,6 +124,30 @@ typedef void* thread_ret_t;
#define GGML_SOFT_MAX_UNROLL 4 #define GGML_SOFT_MAX_UNROLL 4
#define GGML_VEC_DOT_UNROLL 2 #define GGML_VEC_DOT_UNROLL 2
//
// logging
//
#if (GGML_DEBUG >= 1)
#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG(...)
#endif
#if (GGML_DEBUG >= 5)
#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG_5(...)
#endif
#if (GGML_DEBUG >= 10)
#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG_10(...)
#endif
#define GGML_PRINT(...) printf(__VA_ARGS__)
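
A stand-alone usage sketch of the new logging levels; the level gates are reproduced here so the example compiles as its own translation unit, while inside ggml.c the same macros are driven by the GGML_DEBUG define:

#include <stdio.h>

#define GGML_DEBUG 5

#if (GGML_DEBUG >= 1)
#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG(...)
#endif

#if (GGML_DEBUG >= 5)
#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG_5(...)
#endif

#define GGML_PRINT(...) printf(__VA_ARGS__)

int main(void) {
    GGML_PRINT_DEBUG("visible when GGML_DEBUG >= 1\n");
    GGML_PRINT_DEBUG_5("visible when GGML_DEBUG >= 5\n");
    GGML_PRINT("always visible\n");
    return 0;
}
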
#ifdef GGML_USE_ACCELERATE #ifdef GGML_USE_ACCELERATE
// uncomment to use vDSP for soft max computation // uncomment to use vDSP for soft max computation
// note: not sure if it is actually faster // note: not sure if it is actually faster
@ -459,7 +488,6 @@ void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n) {
} }
} }
// //
// timing // timing
// //
@ -522,6 +550,7 @@ int64_t ggml_cycles_per_ms(void) {
#define ggml_perf_cycles_per_ms() 0 #define ggml_perf_cycles_per_ms() 0
#endif #endif
// //
// cache line // cache line
// //
@ -3843,12 +3872,31 @@ struct ggml_context_container {
struct ggml_context context; struct ggml_context context;
}; };
//
// NUMA support
//
#define GGML_NUMA_MAX_NODES 8
#define GGML_NUMA_MAX_CPUS 512
struct ggml_numa_node {
uint32_t cpus[GGML_NUMA_MAX_CPUS]; // hardware threads on this node
uint32_t n_cpus;
};
struct ggml_numa_nodes {
struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES];
uint32_t n_nodes;
uint32_t total_cpus; // hardware threads on system
};
// //
// ggml state // ggml state
// //
struct ggml_state { struct ggml_state {
struct ggml_context_container contexts[GGML_MAX_CONTEXTS]; struct ggml_context_container contexts[GGML_MAX_CONTEXTS];
struct ggml_numa_nodes numa;
}; };
// global state // global state
@ -3873,6 +3921,75 @@ inline static void ggml_critical_section_end(void) {
atomic_fetch_sub(&g_state_barrier, 1); atomic_fetch_sub(&g_state_barrier, 1);
} }
void ggml_numa_init(void) {
if (g_state.numa.n_nodes > 0) {
fprintf(stderr, "ggml_numa_init: NUMA already initialized\n");
return;
}
#ifdef __linux__
struct stat st;
char path[256];
int rv;
// enumerate nodes
while (g_state.numa.n_nodes < GGML_NUMA_MAX_NODES) {
rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", g_state.numa.n_nodes);
GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
if (stat(path, &st) != 0) { break; }
++g_state.numa.n_nodes;
}
// enumerate CPUs
while (g_state.numa.total_cpus < GGML_NUMA_MAX_CPUS) {
rv = snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%u", g_state.numa.total_cpus);
GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
if (stat(path, &st) != 0) { break; }
++g_state.numa.total_cpus;
}
GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state.numa.n_nodes, g_state.numa.total_cpus);
if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1) {
g_state.numa.n_nodes = 0;
return;
}
for (uint32_t n = 0; n < g_state.numa.n_nodes; ++n) {
struct ggml_numa_node * node = &g_state.numa.nodes[n];
GGML_PRINT_DEBUG("CPUs on node %u:", n);
node->n_cpus = 0;
for (uint32_t c = 0; c < g_state.numa.total_cpus; ++c) {
rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u/cpu%u", n, c);
GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
if (stat(path, &st) == 0) {
node->cpus[node->n_cpus++] = c;
GGML_PRINT_DEBUG(" %u", c);
}
}
GGML_PRINT_DEBUG("\n");
}
if (ggml_is_numa()) {
FILE *fptr = fopen("/proc/sys/kernel/numa_balancing", "r");
if (fptr != NULL) {
char buf[42];
if (fgets(buf, sizeof(buf), fptr) && strncmp(buf, "0\n", sizeof(buf)) != 0) {
GGML_PRINT("WARNING: /proc/sys/kernel/numa_balancing is enabled, this has been observed to impair performance\n");
}
fclose(fptr);
}
}
#else
// TODO
#endif
}
bool ggml_is_numa(void) {
return g_state.numa.n_nodes > 1;
}
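
Typical use, sketched under the assumption that the ggml.h from this commit is on the include path: initialize NUMA support once, before any worker threads exist, then branch on the detected topology.

#include "ggml.h"
#include <stdio.h>

int main(void) {
    ggml_numa_init();                 // prints a warning and returns if called a second time
    if (ggml_is_numa()) {
        printf("more than one NUMA node detected; worker threads will be pinned per node\n");
    } else {
        printf("single NUMA node (or detection not implemented on this OS)\n");
    }
    return 0;
}
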
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
void ggml_print_object(const struct ggml_object * obj) { void ggml_print_object(const struct ggml_object * obj) {
@ -4129,6 +4246,10 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
g_state = (struct ggml_state) { g_state = (struct ggml_state) {
/*.contexts =*/ { { 0 } }, /*.contexts =*/ { { 0 } },
/*.numa =*/ {
.n_nodes = 0,
.total_cpus = 0,
},
}; };
for (int i = 0; i < GGML_MAX_CONTEXTS; ++i) { for (int i = 0; i < GGML_MAX_CONTEXTS; ++i) {
@ -6657,6 +6778,7 @@ struct ggml_tensor * ggml_rope_impl(
int n_past, int n_past,
int n_dims, int n_dims,
int mode, int mode,
int n_ctx,
bool inplace) { bool inplace) {
GGML_ASSERT(n_past >= 0); GGML_ASSERT(n_past >= 0);
bool is_node = false; bool is_node = false;
@ -6669,11 +6791,12 @@ struct ggml_tensor * ggml_rope_impl(
ggml_scratch_save(ctx); ggml_scratch_save(ctx);
struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3); struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
((int32_t *) b->data)[0] = n_past; ((int32_t *) b->data)[0] = n_past;
((int32_t *) b->data)[1] = n_dims; ((int32_t *) b->data)[1] = n_dims;
((int32_t *) b->data)[2] = mode; ((int32_t *) b->data)[2] = mode;
((int32_t *) b->data)[3] = n_ctx;
ggml_scratch_load(ctx); ggml_scratch_load(ctx);
@ -6690,8 +6813,9 @@ struct ggml_tensor * ggml_rope(
struct ggml_tensor * a, struct ggml_tensor * a,
int n_past, int n_past,
int n_dims, int n_dims,
int mode) { int mode,
return ggml_rope_impl(ctx, a, n_past, n_dims, mode, false); int n_ctx) {
return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, false);
} }
struct ggml_tensor * ggml_rope_inplace( struct ggml_tensor * ggml_rope_inplace(
@ -6699,8 +6823,9 @@ struct ggml_tensor * ggml_rope_inplace(
struct ggml_tensor * a, struct ggml_tensor * a,
int n_past, int n_past,
int n_dims, int n_dims,
int mode) { int mode,
return ggml_rope_impl(ctx, a, n_past, n_dims, mode, true); int n_ctx) {
return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, true);
} }
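
A minimal end-to-end call of the widened API, assuming the ggml.h/ggml.c from this commit and arbitrary sizes; existing LLaMA-style callers simply pass 0 for the new n_ctx argument, which is only consulted on the ChatGLM path (mode & 4):

#include "ggml.h"

int main(void) {
    struct ggml_init_params ip = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(ip);

    const int n_embd = 64, n_head = 4, n_tokens = 8, n_past = 0;
    struct ggml_tensor * cur = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd/n_head, n_head, n_tokens);

    // was ggml_rope(ctx, cur, n_past, n_dims, mode); the trailing 0 is the new n_ctx
    struct ggml_tensor * rot = ggml_rope(ctx, cur, n_past, n_embd/n_head, 0, 0);

    struct ggml_cgraph gf = ggml_build_forward(rot);
    gf.n_threads = 1;
    ggml_graph_compute(ctx, &gf);

    ggml_free(ctx);
    return 0;
}
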
// ggml_rope_back // ggml_rope_back
@ -12319,7 +12444,7 @@ static void ggml_compute_forward_rope_f32(
const struct ggml_tensor * src1, const struct ggml_tensor * src1,
struct ggml_tensor * dst) { struct ggml_tensor * dst) {
GGML_ASSERT(src1->type == GGML_TYPE_I32); GGML_ASSERT(src1->type == GGML_TYPE_I32);
GGML_ASSERT(ggml_nelements(src1) == 3); GGML_ASSERT(ggml_nelements(src1) == 4);
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return; return;
@ -12328,6 +12453,7 @@ static void ggml_compute_forward_rope_f32(
const int n_past = ((int32_t *) src1->data)[0]; const int n_past = ((int32_t *) src1->data)[0];
const int n_dims = ((int32_t *) src1->data)[1]; const int n_dims = ((int32_t *) src1->data)[1];
const int mode = ((int32_t *) src1->data)[2]; const int mode = ((int32_t *) src1->data)[2];
const int n_ctx = ((int32_t *) src1->data)[3];
assert(n_past >= 0); assert(n_past >= 0);
@ -12372,6 +12498,7 @@ static void ggml_compute_forward_rope_f32(
const float theta_scale = powf(10000.0, -2.0f/n_dims); const float theta_scale = powf(10000.0, -2.0f/n_dims);
const bool is_neox = mode & 2; const bool is_neox = mode & 2;
const bool is_glm = mode & 4;
for (int64_t i3 = 0; i3 < ne3; i3++) { for (int64_t i3 = 0; i3 < ne3; i3++) {
for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) { for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) {
@ -12382,7 +12509,32 @@ static void ggml_compute_forward_rope_f32(
float theta = (float)p; float theta = (float)p;
if (!is_neox) { if (is_glm) {
theta = MIN(p, n_ctx - 2);
float block_theta = MAX(p - (n_ctx - 2), 0);
for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
const float cos_theta = cosf(theta);
const float sin_theta = sinf(theta);
const float cos_block_theta = cosf(block_theta);
const float sin_block_theta = sinf(block_theta);
theta *= theta_scale;
block_theta *= theta_scale;
const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
const float x0 = src[0];
const float x1 = src[n_dims/2];
const float x2 = src[n_dims];
const float x3 = src[n_dims/2*3];
dst_data[0] = x0*cos_theta - x1*sin_theta;
dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
dst_data[n_dims] = x2*cos_block_theta - x3*sin_block_theta;
dst_data[n_dims/2*3] = x2*sin_block_theta + x3*cos_block_theta;
}
} else if (!is_neox) {
for (int64_t i0 = 0; i0 < ne0; i0 += 2) { for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
const float cos_theta = cosf(theta); const float cos_theta = cosf(theta);
const float sin_theta = sinf(theta); const float sin_theta = sinf(theta);
@ -12432,7 +12584,7 @@ static void ggml_compute_forward_rope_f16(
const struct ggml_tensor * src1, const struct ggml_tensor * src1,
struct ggml_tensor * dst) { struct ggml_tensor * dst) {
GGML_ASSERT(src1->type == GGML_TYPE_I32); GGML_ASSERT(src1->type == GGML_TYPE_I32);
GGML_ASSERT(ggml_nelements(src1) == 3); GGML_ASSERT(ggml_nelements(src1) == 4);
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return; return;
@ -12441,6 +12593,7 @@ static void ggml_compute_forward_rope_f16(
const int n_past = ((int32_t *) src1->data)[0]; const int n_past = ((int32_t *) src1->data)[0];
const int n_dims = ((int32_t *) src1->data)[1]; const int n_dims = ((int32_t *) src1->data)[1];
const int mode = ((int32_t *) src1->data)[2]; const int mode = ((int32_t *) src1->data)[2];
const int n_ctx = ((int32_t *) src1->data)[3];
assert(n_past >= 0); assert(n_past >= 0);
@ -12485,6 +12638,7 @@ static void ggml_compute_forward_rope_f16(
const float theta_scale = powf(10000.0, -2.0f/n_dims); const float theta_scale = powf(10000.0, -2.0f/n_dims);
const bool is_neox = mode & 2; const bool is_neox = mode & 2;
const bool is_glm = mode & 4;
for (int64_t i3 = 0; i3 < ne3; i3++) { for (int64_t i3 = 0; i3 < ne3; i3++) {
for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) { for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) {
@ -12495,7 +12649,32 @@ static void ggml_compute_forward_rope_f16(
float theta = (float)p; float theta = (float)p;
if (!is_neox) { if (is_glm) {
theta = MIN(p, n_ctx - 2);
float block_theta = MAX(p - (n_ctx - 2), 0);
for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
const float cos_theta = cosf(theta);
const float sin_theta = sinf(theta);
const float cos_block_theta = cosf(block_theta);
const float sin_block_theta = sinf(block_theta);
theta *= theta_scale;
block_theta *= theta_scale;
const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
const float x0 = GGML_FP16_TO_FP32(src[0]);
const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
const float x2 = GGML_FP16_TO_FP32(src[n_dims]);
const float x3 = GGML_FP16_TO_FP32(src[n_dims/2*3]);
dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
dst_data[n_dims] = GGML_FP32_TO_FP16(x2*cos_block_theta - x3*sin_block_theta);
dst_data[n_dims/2*3] = GGML_FP32_TO_FP16(x2*sin_block_theta + x3*cos_block_theta);
}
} if (!is_neox) {
for (int64_t i0 = 0; i0 < ne0; i0 += 2) { for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
const float cos_theta = cosf(theta); const float cos_theta = cosf(theta);
const float sin_theta = sinf(theta); const float sin_theta = sinf(theta);
@ -13387,8 +13566,7 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
const int nk1 = ne01; const int nk1 = ne01;
// size of the convolution row - the kernel size unrolled across all channels // size of the convolution row - the kernel size unrolled across all channels
// round-up so it is more suitable for SIMD const int ew0 = nk0*nk1*ne02;
const int ew0 = ggml_up32(nk0*nk1*ne02);
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
GGML_ASSERT(nb10 == sizeof(float)); GGML_ASSERT(nb10 == sizeof(float));
@ -16069,17 +16247,19 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
{ {
if (src0->grad) { if (src0->grad) {
assert(src1->type == GGML_TYPE_I32); assert(src1->type == GGML_TYPE_I32);
assert(ggml_nelements(src1) == 3); assert(ggml_nelements(src1) == 4);
const int n_past = ((int32_t *) src1->data)[0]; const int n_past = ((int32_t *) src1->data)[0];
const int n_dims = ((int32_t *) src1->data)[1]; const int n_dims = ((int32_t *) src1->data)[1];
const int mode = ((int32_t *) src1->data)[2]; const int mode = ((int32_t *) src1->data)[2];
const int n_ctx = ((int32_t *) src1->data)[3];
src0->grad = ggml_add_impl(ctx, src0->grad = ggml_add_impl(ctx,
src0->grad, src0->grad,
ggml_rope(ctx, ggml_rope(ctx,
tensor->grad, tensor->grad,
n_past, n_past,
n_dims, n_dims,
mode), mode,
n_ctx),
inplace); inplace);
} }
if (src1->grad) { if (src1->grad) {
@ -16504,68 +16684,172 @@ typedef pthread_t ggml_thread_t;
#endif #endif
#ifdef __linux__
void set_numa_thread_affinity(int thread_n, int n_threads) {
if (!ggml_is_numa()) {
return;
}
// run thread on node_num thread_n / (threads per node)
const int node_num = thread_n / ((n_threads + g_state.numa.n_nodes - 1) / g_state.numa.n_nodes);
struct ggml_numa_node * node = &g_state.numa.nodes[node_num];
size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus);
cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus);
CPU_ZERO_S(setsize, cpus);
for (size_t i = 0; i < node->n_cpus; ++i) {
CPU_SET_S(node->cpus[i], setsize, cpus);
}
int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
if (rv) {
fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",
strerror(rv));
}
CPU_FREE(cpus);
}
void clear_numa_thread_affinity(void) {
if (!ggml_is_numa()) {
return;
}
size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus);
cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus);
CPU_ZERO_S(setsize, cpus);
for (unsigned i = 0; i < g_state.numa.total_cpus; ++i) {
CPU_SET_S(i, setsize, cpus);
}
int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
if (rv) {
fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",
strerror(rv));
}
CPU_FREE(cpus);
}
#else
// TODO: Windows etc.
// (the linux implementation may also work on BSD, someone should test)
void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); }
void clear_numa_thread_affinity(void) {}
#endif
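
The thread-to-node mapping used by set_numa_thread_affinity above reduces to a ceiling division; a small worked example with assumed values (8 threads, 2 nodes):

#include <stdio.h>

int main(void) {
    const int n_threads = 8, n_nodes = 2;                        // example values, not probed
    const int per_node  = (n_threads + n_nodes - 1) / n_nodes;   // ceil(8/2) = 4
    for (int thread_n = 0; thread_n < n_threads; ++thread_n) {
        printf("thread %d -> node %d\n", thread_n, thread_n / per_node);
    }
    return 0;   // threads 0-3 land on node 0, threads 4-7 on node 1
}
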
struct ggml_compute_state_shared { struct ggml_compute_state_shared {
ggml_lock_t spin; struct ggml_cgraph * cgraph;
int64_t perf_node_start_cycles;
int64_t perf_node_start_time_us;
int n_threads; int n_threads;
// synchronization primitives // synchronization primitives
atomic_int n_ready; atomic_int n_active; // num active threads
atomic_bool has_work; atomic_int node_n; // active graph node
atomic_bool stop; // stop all threads
}; };
struct ggml_compute_state { struct ggml_compute_state {
ggml_thread_t thrd; ggml_thread_t thrd;
int ith;
struct ggml_compute_params params;
struct ggml_tensor * node;
struct ggml_compute_state_shared * shared; struct ggml_compute_state_shared * shared;
}; };
static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
int64_t cycles_cur = ggml_perf_cycles() - st->perf_node_start_cycles;
int64_t time_us_cur = ggml_perf_time_us() - st->perf_node_start_time_us;
node->perf_runs++;
node->perf_cycles += cycles_cur;
node->perf_time_us += time_us_cur;
}
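
The new scheduler replaces the old per-node fork/join (removed further down) with a single persistent pool in which the last thread to finish a node becomes the coordinator for the next one. A stripped-down sketch of that idiom, illustrative only and with made-up per-node "work":

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <sched.h>

#define N_THREADS 4
#define N_NODES   8

static atomic_int n_active = N_THREADS;   // threads still busy with the current node
static atomic_int node_n   = -1;          // index of the node being processed

static void * worker(void * arg) {
    const int ith = (int)(long) arg;
    int local_node = -1;
    while (1) {
        if (atomic_fetch_sub(&n_active, 1) == 1) {
            // last one to finish: "finalize" the old node, "init" and publish the next
            local_node = atomic_load(&node_n) + 1;
            if (local_node < N_NODES) {
                printf("thread %d dispatches node %d\n", ith, local_node);
            }
            atomic_store(&n_active, N_THREADS);
            atomic_store(&node_n, local_node);
        } else {
            // everyone else spins until the coordinator publishes a new node
            const int last = local_node;
            do {
                sched_yield();
                local_node = atomic_load(&node_n);
            } while (local_node == last);
        }
        if (local_node >= N_NODES) break;
        // ... per-thread COMPUTE work for nodes[local_node] would go here ...
    }
    return NULL;
}

int main(void) {
    pthread_t th[N_THREADS];
    for (long i = 1; i < N_THREADS; ++i) pthread_create(&th[i], NULL, worker, (void *) i);
    worker((void *) 0);   // the main thread participates, as in ggml_graph_compute
    for (int i = 1; i < N_THREADS; ++i) pthread_join(th[i], NULL);
    return 0;
}
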
static thread_ret_t ggml_graph_compute_thread(void * data) { static thread_ret_t ggml_graph_compute_thread(void * data) {
struct ggml_compute_state * state = (struct ggml_compute_state *) data; struct ggml_compute_state * state = (struct ggml_compute_state *) data;
struct ggml_cgraph * cgraph = state->shared->cgraph;
const int n_threads = state->shared->n_threads; const int n_threads = state->shared->n_threads;
set_numa_thread_affinity(state->ith, n_threads);
int node_n = -1;
while (true) { while (true) {
if (atomic_fetch_add(&state->shared->n_ready, 1) == n_threads - 1) { if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
atomic_store(&state->shared->has_work, false); // all other threads are finished and spinning
} else { // do finalize and init here so we don't have synchronize again
while (atomic_load(&state->shared->has_work)) { struct ggml_compute_params params = {
if (atomic_load(&state->shared->stop)) { /*.type =*/ GGML_TASK_FINALIZE,
return 0; /*.ith =*/ 0,
/*.nth =*/ 0,
/*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
/*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
};
if (node_n != -1) {
/* FINALIZE */
struct ggml_tensor * node = state->shared->cgraph->nodes[node_n];
params.nth = node->n_tasks;
ggml_compute_forward(&params, node);
ggml_graph_compute_perf_stats_node(node, state->shared);
}
// distribute new work or execute it direct if 1T
while (++node_n < cgraph->n_nodes) {
GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
struct ggml_tensor * node = cgraph->nodes[node_n];
state->shared->perf_node_start_cycles = ggml_perf_cycles();
state->shared->perf_node_start_time_us = ggml_perf_time_us();
/* INIT */
params.type = GGML_TASK_INIT;
params.nth = node->n_tasks;
ggml_compute_forward(&params, node);
if (node->n_tasks == 1) {
// TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
// they do something more efficient than spinning (?)
params.type = GGML_TASK_COMPUTE;
ggml_compute_forward(&params, node);
params.type = GGML_TASK_FINALIZE;
ggml_compute_forward(&params, node);
ggml_graph_compute_perf_stats_node(node, state->shared);
} else {
break;
} }
ggml_lock_lock (&state->shared->spin);
ggml_lock_unlock(&state->shared->spin);
} }
}
atomic_fetch_sub(&state->shared->n_ready, 1); atomic_store(&state->shared->n_active, n_threads);
atomic_store(&state->shared->node_n, node_n);
// wait for work } else {
while (!atomic_load(&state->shared->has_work)) { // wait for other threads to finish
if (atomic_load(&state->shared->stop)) { const int last = node_n;
return 0; do {
} sched_yield();
ggml_lock_lock (&state->shared->spin); node_n = atomic_load(&state->shared->node_n);
ggml_lock_unlock(&state->shared->spin); } while (node_n == last);
} }
// check if we should stop // check if we should stop
if (atomic_load(&state->shared->stop)) { if (node_n >= cgraph->n_nodes) break;
break;
}
if (state->node) { /* COMPUTE */
if (state->params.ith < state->params.nth) { struct ggml_tensor * node = cgraph->nodes[node_n];
ggml_compute_forward(&state->params, state->node);
}
state->node = NULL; struct ggml_compute_params params = {
} else { /*.type =*/ GGML_TASK_COMPUTE,
break; /*.ith =*/ state->ith,
/*.nth =*/ node->n_tasks,
/*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
/*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
};
if (state->ith < node->n_tasks) {
ggml_compute_forward(&params, node);
} }
} }
@ -16576,39 +16860,14 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
const int n_threads = cgraph->n_threads; const int n_threads = cgraph->n_threads;
struct ggml_compute_state_shared state_shared = { struct ggml_compute_state_shared state_shared = {
/*.spin =*/ GGML_LOCK_INITIALIZER, /*.cgraph =*/ cgraph,
/*.n_threads =*/ n_threads, /*.perf_node_start_cycles =*/ 0,
/*.n_ready =*/ 0, /*.perf_node_start_time_us =*/ 0,
/*.has_work =*/ false, /*.n_threads =*/ n_threads,
/*.stop =*/ false, /*.n_active =*/ n_threads,
/*.node_n =*/ -1,
}; };
struct ggml_compute_state * workers = n_threads > 1 ? alloca(sizeof(struct ggml_compute_state)*(n_threads - 1)) : NULL; struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
// create thread pool
if (n_threads > 1) {
ggml_lock_init(&state_shared.spin);
atomic_store(&state_shared.has_work, true);
for (int j = 0; j < n_threads - 1; j++) {
workers[j] = (struct ggml_compute_state) {
.thrd = 0,
.params = {
.type = GGML_TASK_COMPUTE,
.ith = j + 1,
.nth = n_threads,
.wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,
.wdata = cgraph->work ? cgraph->work->data : NULL,
},
.node = NULL,
.shared = &state_shared,
};
int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
GGML_ASSERT(rc == 0);
UNUSED(rc);
}
}
// initialize tasks + work buffer // initialize tasks + work buffer
{ {
@ -16752,7 +17011,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
} break; } break;
case GGML_OP_SCALE: case GGML_OP_SCALE:
{ {
node->n_tasks = n_threads; node->n_tasks = 1;
} break; } break;
case GGML_OP_SET: case GGML_OP_SET:
case GGML_OP_CONT: case GGML_OP_CONT:
@ -16956,166 +17215,37 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
} }
} }
// create thread pool
if (n_threads > 1) {
for (int j = 1; j < n_threads; ++j) {
workers[j] = (struct ggml_compute_state) {
.thrd = 0,
.ith = j,
.shared = &state_shared,
};
const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
GGML_ASSERT(rc == 0);
}
}
workers[0].ith = 0;
workers[0].shared = &state_shared;
const int64_t perf_start_cycles = ggml_perf_cycles(); const int64_t perf_start_cycles = ggml_perf_cycles();
const int64_t perf_start_time_us = ggml_perf_time_us(); const int64_t perf_start_time_us = ggml_perf_time_us();
for (int i = 0; i < cgraph->n_nodes; i++) { // this is a work thread too
GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, i, cgraph->n_nodes); ggml_graph_compute_thread(&workers[0]);
struct ggml_tensor * node = cgraph->nodes[i]; // don't leave affinity set on the main thread
clear_numa_thread_affinity();
// TODO: this could be used to avoid unnecessary computations, but it needs to be improved
//if (node->grad == NULL && node->perf_runs > 0) {
// continue;
//}
const int64_t perf_node_start_cycles = ggml_perf_cycles();
const int64_t perf_node_start_time_us = ggml_perf_time_us();
// INIT
struct ggml_compute_params params = {
/*.type =*/ GGML_TASK_INIT,
/*.ith =*/ 0,
/*.nth =*/ node->n_tasks,
/*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
/*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
};
ggml_compute_forward(&params, node);
// COMPUTE
if (node->n_tasks > 1) {
if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
atomic_store(&state_shared.has_work, false);
}
while (atomic_load(&state_shared.has_work)) {
ggml_lock_lock (&state_shared.spin);
ggml_lock_unlock(&state_shared.spin);
}
// launch thread pool
for (int j = 0; j < n_threads - 1; j++) {
workers[j].params = (struct ggml_compute_params) {
.type = GGML_TASK_COMPUTE,
.ith = j + 1,
.nth = node->n_tasks,
.wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,
.wdata = cgraph->work ? cgraph->work->data : NULL,
};
workers[j].node = node;
}
atomic_fetch_sub(&state_shared.n_ready, 1);
while (atomic_load(&state_shared.n_ready) > 0) {
ggml_lock_lock (&state_shared.spin);
ggml_lock_unlock(&state_shared.spin);
}
atomic_store(&state_shared.has_work, true);
}
params.type = GGML_TASK_COMPUTE;
ggml_compute_forward(&params, node);
// wait for thread pool
if (node->n_tasks > 1) {
if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
atomic_store(&state_shared.has_work, false);
}
while (atomic_load(&state_shared.has_work)) {
ggml_lock_lock (&state_shared.spin);
ggml_lock_unlock(&state_shared.spin);
}
atomic_fetch_sub(&state_shared.n_ready, 1);
while (atomic_load(&state_shared.n_ready) != 0) {
ggml_lock_lock (&state_shared.spin);
ggml_lock_unlock(&state_shared.spin);
}
}
// FINALIZE
if (node->n_tasks > 1) {
if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
atomic_store(&state_shared.has_work, false);
}
while (atomic_load(&state_shared.has_work)) {
ggml_lock_lock (&state_shared.spin);
ggml_lock_unlock(&state_shared.spin);
}
// launch thread pool
for (int j = 0; j < n_threads - 1; j++) {
workers[j].params = (struct ggml_compute_params) {
.type = GGML_TASK_FINALIZE,
.ith = j + 1,
.nth = node->n_tasks,
.wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,
.wdata = cgraph->work ? cgraph->work->data : NULL,
};
workers[j].node = node;
}
atomic_fetch_sub(&state_shared.n_ready, 1);
while (atomic_load(&state_shared.n_ready) > 0) {
ggml_lock_lock (&state_shared.spin);
ggml_lock_unlock(&state_shared.spin);
}
atomic_store(&state_shared.has_work, true);
}
params.type = GGML_TASK_FINALIZE;
ggml_compute_forward(&params, node);
// wait for thread pool
if (node->n_tasks > 1) {
if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
atomic_store(&state_shared.has_work, false);
}
while (atomic_load(&state_shared.has_work)) {
ggml_lock_lock (&state_shared.spin);
ggml_lock_unlock(&state_shared.spin);
}
atomic_fetch_sub(&state_shared.n_ready, 1);
while (atomic_load(&state_shared.n_ready) != 0) {
ggml_lock_lock (&state_shared.spin);
ggml_lock_unlock(&state_shared.spin);
}
}
// performance stats (node)
{
int64_t perf_cycles_cur = ggml_perf_cycles() - perf_node_start_cycles;
int64_t perf_time_us_cur = ggml_perf_time_us() - perf_node_start_time_us;
node->perf_runs++;
node->perf_cycles += perf_cycles_cur;
node->perf_time_us += perf_time_us_cur;
}
}
// join thread pool // join thread pool
if (n_threads > 1) { if (n_threads > 1) {
atomic_store(&state_shared.stop, true); for (int j = 1; j < n_threads; j++) {
atomic_store(&state_shared.has_work, true); const int rc = ggml_thread_join(workers[j].thrd, NULL);
for (int j = 0; j < n_threads - 1; j++) {
int rc = ggml_thread_join(workers[j].thrd, NULL);
GGML_ASSERT(rc == 0); GGML_ASSERT(rc == 0);
UNUSED(rc);
} }
ggml_lock_destroy(&state_shared.spin);
} }
// performance stats (graph) // performance stats (graph)
ggml.h
View file
@ -198,7 +198,7 @@
#define GGML_MAX_PARAMS 256 #define GGML_MAX_PARAMS 256
#define GGML_MAX_CONTEXTS 64 #define GGML_MAX_CONTEXTS 64
#define GGML_MAX_OPT 4 #define GGML_MAX_OPT 4
#define GGML_MAX_NAME 32 #define GGML_MAX_NAME 48
#define GGML_DEFAULT_N_THREADS 4 #define GGML_DEFAULT_N_THREADS 4
#define GGML_ASSERT(x) \ #define GGML_ASSERT(x) \
@ -469,6 +469,9 @@ extern "C" {
GGML_API int64_t ggml_cycles(void); GGML_API int64_t ggml_cycles(void);
GGML_API int64_t ggml_cycles_per_ms(void); GGML_API int64_t ggml_cycles_per_ms(void);
GGML_API void ggml_numa_init(void); // call once for better performance on NUMA systems
GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
GGML_API void ggml_print_object (const struct ggml_object * obj); GGML_API void ggml_print_object (const struct ggml_object * obj);
GGML_API void ggml_print_objects(const struct ggml_context * ctx); GGML_API void ggml_print_objects(const struct ggml_context * ctx);
@@ -1033,13 +1036,15 @@ extern "C" {
// rotary position embedding
// if mode & 1 == 1, skip n_past elements
// if mode & 2 == 1, GPT-NeoX style
+// if mode & 4 == 1, ChatGLM style
// TODO: avoid creating a new tensor every time
GGML_API struct ggml_tensor * ggml_rope(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_past,
int n_dims,
-int mode);
+int mode,
+int n_ctx);
// in-place, returns view(a)
@@ -1047,7 +1052,8 @@ extern "C" {
struct ggml_tensor * a,
int n_past,
int n_dims,
-int mode);
+int mode,
+int n_ctx);
// rotary position embedding backward, i.e compute dx from dy
// a - dy
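A hedged sketch of a caller of the updated ggml_rope() signature; the shapes and hyperparameters are made up for illustration, only the argument order matters, and the trailing 0 mirrors what the updated call sites in this commit pass for n_ctx.

#include "ggml.h"

int main() {
    struct ggml_init_params params = { /*mem_size*/ 16*1024*1024, /*mem_buffer*/ NULL, /*no_alloc*/ false };
    struct ggml_context * ctx = ggml_init(params);

    const int n_embd = 64, n_head = 8, N = 4, n_past = 0, n_rot = n_embd/n_head;
    struct ggml_tensor * cur = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, N);

    // LLaMA-style RoPE (mode 0); n_ctx is only consulted by the ChatGLM mode (mode & 4)
    struct ggml_tensor * q = ggml_rope(ctx,
            ggml_reshape_3d(ctx, cur, n_embd/n_head, n_head, N),
            n_past, n_rot, /*mode=*/0, /*n_ctx=*/0);
    (void) q;

    ggml_free(ctx);
    return 0;
}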

k_quants.c

File diff suppressed because it is too large.

View file

@@ -7,7 +7,13 @@
#include <stddef.h>

// Super-block size
+#ifdef GGML_QKK_64
+#define QK_K 64
+#define K_SCALE_SIZE 4
+#else
#define QK_K 256
+#define K_SCALE_SIZE 12
+#endif

//
// Super-block quantization structures
@@ -29,38 +35,67 @@ static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "w
// weight is represented as x = a * q
// 16 blocks of 16 elemenets each
// Effectively 3.4375 bits per weight
+#ifdef GGML_QKK_64
typedef struct {
uint8_t hmask[QK_K/8]; // quants - high bit
uint8_t qs[QK_K/4]; // quants - low 2 bits
-uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
+uint8_t scales[2];
ggml_fp16_t d; // super-block scale
} block_q3_K;
-static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + 11 * QK_K / 64, "wrong q3_K block size/padding");
+static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding");
+#else
+typedef struct {
+uint8_t hmask[QK_K/8]; // quants - high bit
+uint8_t qs[QK_K/4]; // quants - low 2 bits
+uint8_t scales[12]; // scales, quantized with 6 bits
+ggml_fp16_t d; // super-block scale
+} block_q3_K;
+static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
+#endif

// 4-bit quantization
// 16 blocks of 32 elements each
// weight is represented as x = a * q + b
// Effectively 4.5 bits per weight
+#ifdef GGML_QKK_64
+typedef struct {
+ggml_fp16_t d[2]; // super-block scales/mins
+uint8_t scales[2]; // 4-bit block scales/mins
+uint8_t qs[QK_K/2]; // 4--bit quants
+} block_q4_K;
+static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding");
+#else
typedef struct {
ggml_fp16_t d; // super-block scale for quantized scales
ggml_fp16_t dmin; // super-block scale for quantized mins
-uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
+uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
uint8_t qs[QK_K/2]; // 4--bit quants
} block_q4_K;
-static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
+static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
+#endif

// 5-bit quantization
// 16 blocks of 32 elements each
// weight is represented as x = a * q + b
// Effectively 5.5 bits per weight
+#ifdef GGML_QKK_64
typedef struct {
-ggml_fp16_t d; // super-block scale for quantized scales
-ggml_fp16_t dmin; // super-block scale for quantized mins
-uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
+ggml_fp16_t d; // super-block scale
+int8_t scales[QK_K/16]; // 8-bit block scales
uint8_t qh[QK_K/8]; // quants, high bit
uint8_t qs[QK_K/2]; // quants, low 4 bits
} block_q5_K;
-static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
+static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
+#else
+typedef struct {
+ggml_fp16_t d; // super-block scale for quantized scales
+ggml_fp16_t dmin; // super-block scale for quantized mins
+uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
+uint8_t qh[QK_K/8]; // quants, high bit
+uint8_t qs[QK_K/2]; // quants, low 4 bits
+} block_q5_K;
+static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
+#endif

// 6-bit quantization
// weight is represented as x = a * q
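As a sanity check on the "effectively X bits per weight" comments, the per-weight cost can be recomputed from the struct layouts above; a small sketch follows, assuming QK_K == 256 and a 2-byte ggml_fp16_t.

#include <cstdio>

// Recompute bits-per-weight from the QK_K == 256 block layouts shown above.
int main() {
    const int QK_K = 256;
    const int q3_bytes = 2 /*d*/ + QK_K/4 /*qs*/ + QK_K/8 /*hmask*/ + 12 /*scales*/; // 110 bytes
    const int q4_bytes = 2*2 /*d, dmin*/ + 12 /*scales*/ + QK_K/2 /*qs*/;             // 144 bytes
    std::printf("q3_K: %.4f bits/weight\n", 8.0 * q3_bytes / QK_K); // 3.4375, matching the comment
    std::printf("q4_K: %.4f bits/weight\n", 8.0 * q4_bytes / QK_K); // 4.5, matching the comment
    return 0;
}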

View file

@@ -172,12 +172,14 @@ struct llama_mmap {
#ifdef _POSIX_MAPPED_FILES
static constexpr bool SUPPORTED = true;
-llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */) {
+llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) {
size = file->size;
int fd = fileno(file->fp);
int flags = MAP_SHARED;
+// prefetch/readahead impairs performance on NUMA systems
+if (numa) { prefetch = 0; }
#ifdef __linux__
-flags |= MAP_POPULATE;
+if (prefetch) { flags |= MAP_POPULATE; }
#endif
addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
if (addr == MAP_FAILED) {
@@ -191,6 +193,14 @@ struct llama_mmap {
strerror(errno));
}
}
+if (numa) {
+    // advise the kernel not to use readahead
+    // (because the next page might not belong on the same node)
+    if (madvise(addr, file->size, MADV_RANDOM)) {
+        fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n",
+                strerror(errno));
+    }
+}
}
~llama_mmap() {
@@ -199,7 +209,9 @@ struct llama_mmap {
#elif defined(_WIN32)
static constexpr bool SUPPORTED = true;
-llama_mmap(struct llama_file * file, bool prefetch = true) {
+llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) {
+(void) numa;
size = file->size;
HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
@@ -248,8 +260,10 @@
#else
static constexpr bool SUPPORTED = false;
-llama_mmap(struct llama_file *, bool prefetch = true) {
-(void)prefetch;
+llama_mmap(struct llama_file *, bool prefetch = true, bool numa = false) {
+(void) prefetch;
+(void) numa;
throw std::runtime_error(std::string("mmap not supported"));
}
#endif
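A minimal, self-contained sketch of the same idea outside llama.cpp: map a file read-only and, when more than one NUMA node is present, skip MAP_POPULATE and ask the kernel for random-access behaviour. The helper name and the "model.bin" path are placeholders, and error handling is reduced to the essentials.

#include <cstdio>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

// Hypothetical helper mirroring the prefetch/MADV_RANDOM logic in the diff above.
void * map_readonly(const char * path, size_t * out_size, bool numa) {
    int fd = open(path, O_RDONLY);
    if (fd < 0) { perror("open"); return nullptr; }

    struct stat st;
    if (fstat(fd, &st) != 0) { perror("fstat"); close(fd); return nullptr; }
    *out_size = (size_t) st.st_size;

    int flags = MAP_SHARED;
#ifdef __linux__
    if (!numa) { flags |= MAP_POPULATE; }  // prefetch only on single-node systems
#endif
    void * addr = mmap(nullptr, *out_size, PROT_READ, flags, fd, 0);
    close(fd);                             // the mapping keeps the file contents accessible
    if (addr == MAP_FAILED) { perror("mmap"); return nullptr; }

    if (numa && madvise(addr, *out_size, MADV_RANDOM) != 0) {
        perror("madvise(MADV_RANDOM)");    // non-fatal, keep the mapping
    }
    return addr;
}

int main() {
    size_t size = 0;
    void * data = map_readonly("model.bin", &size, /*numa=*/true); // placeholder path
    if (data) { munmap(data, size); }
    return data ? 0 : 1;
}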

View file

@@ -21,9 +21,13 @@
#endif
#ifdef GGML_USE_K_QUANTS
#ifndef QK_K
+#ifdef GGML_QKK_64
+#define QK_K 64
+#else
#define QK_K 256
#endif
#endif
+#endif
#include <array>
#include <ctime>
@@ -770,7 +774,7 @@ struct llama_model_loader {
}
if (use_mmap) {
-mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
+mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size, ggml_is_numa()));
if (lmlock) {
lmlock->init(mapping->addr);
}
@@ -973,7 +977,7 @@ bool llama_mlock_supported() {
return llama_mlock::SUPPORTED;
}
-void llama_init_backend() {
+void llama_init_backend(bool numa) {
ggml_time_init();
// needed to initialize f16 tables
@@ -982,6 +986,10 @@ void llama_init_backend() {
struct ggml_context * ctx = ggml_init(params);
ggml_free(ctx);
}
+if (numa) {
+    ggml_numa_init();
+}
}
int64_t llama_time_us() {
@@ -1483,11 +1491,11 @@ static bool llama_eval_internal(
offload_func_kq(tmpq);
ggml_set_name(tmpq, "tmpq");
-struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0);
+struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
offload_func_kq(Kcur);
ggml_set_name(Kcur, "Kcur");
-struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0);
+struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
offload_func_kq(Qcur);
ggml_set_name(Qcur, "Qcur");
@@ -2470,6 +2478,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
std::vector<std::thread> workers;
std::mutex mutex;
+auto use_more_bits = [] (int i_layer, int num_layers) -> bool {
+    return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
+};
size_t idx = 0;
for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
llama_buffer read_data;
@@ -2524,15 +2536,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
-(i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8 ||
-(i_attention_wv - n_attention_wv/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
+use_more_bits(i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
+else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
+(i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
++i_attention_wv;
} else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
-(i_feed_forward_w2 < n_feed_forward_w2/8 || i_feed_forward_w2 >= 7*n_feed_forward_w2/8 ||
-(i_feed_forward_w2 - n_feed_forward_w2/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
+use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+//else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < n_feed_forward_w2/8) new_type = GGML_TYPE_Q6_K;
++i_feed_forward_w2;
} else if (tensor.name.find("attention.wo.weight") != std::string::npos) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
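To make the new use_more_bits selection rule concrete, here is a small stand-alone sketch that prints which layers of a hypothetical 32-layer model would have their attention.wv tensors promoted to Q6_K under the Q4_K_M/Q5_K_M path above.

#include <cstdio>

// Same predicate as the lambda introduced in the diff above.
static bool use_more_bits(int i_layer, int num_layers) {
    return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
}

int main() {
    const int n_layers = 32; // illustrative layer count
    std::printf("layers promoted to Q6_K:");
    for (int i = 0; i < n_layers; ++i) {
        if (use_more_bits(i, n_layers)) {
            std::printf(" %d", i);
        }
    }
    std::printf("\n");
    return 0;
}

In words: the first eighth and the last eighth of the layers always get the extra bits, and every third layer in between does as well.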
@@ -2890,7 +2903,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
// maybe this should in llama_model_loader
if (model_loader->use_mmap) {
-model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0));
+model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0, ggml_is_numa()));
}
}

View file

@@ -140,8 +140,9 @@ extern "C" {
// TODO: not great API - very likely to change
// Initialize the llama + ggml backend
+// If numa is true, use NUMA optimizations
// Call once at the start of the program
-LLAMA_API void llama_init_backend();
+LLAMA_API void llama_init_backend(bool numa);
LLAMA_API int64_t llama_time_us();
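For completeness, a hedged sketch of how an application built against this header would adopt the changed entry point; the --numa flag parsing here is an assumption for illustration, not part of the library.

#include <cstring>
#include "llama.h"

int main(int argc, char ** argv) {
    // Hypothetical flag handling: forward a --numa switch to the backend.
    bool numa = false;
    for (int i = 1; i < argc; ++i) {
        if (std::strcmp(argv[i], "--numa") == 0) { numa = true; }
    }

    llama_init_backend(numa); // was llama_init_backend() before this change

    // ... load the model and evaluate as usual ...
    return 0;
}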