diff --git a/.devops/full-cuda.Dockerfile b/.devops/full-cuda.Dockerfile
index c01006efe..f6073f662 100644
--- a/.devops/full-cuda.Dockerfile
+++ b/.devops/full-cuda.Dockerfile
@@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} as build
ARG CUDA_DOCKER_ARCH=all
RUN apt-get update && \
- apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev
+ apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1
COPY requirements.txt requirements.txt
COPY requirements requirements
diff --git a/.devops/full.Dockerfile b/.devops/full.Dockerfile
index 6d5943a2f..6f19afa9c 100644
--- a/.devops/full.Dockerfile
+++ b/.devops/full.Dockerfile
@@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION as build
RUN apt-get update && \
- apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev
+ apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1
COPY requirements.txt requirements.txt
COPY requirements requirements
diff --git a/.devops/main-cuda.Dockerfile b/.devops/main-cuda.Dockerfile
index 23f428944..2aec4a85d 100644
--- a/.devops/main-cuda.Dockerfile
+++ b/.devops/main-cuda.Dockerfile
@@ -23,10 +23,13 @@ ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable CUDA
ENV LLAMA_CUDA=1
-RUN make -j$(nproc)
+RUN make -j$(nproc) main
FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
+RUN apt-get update && \
+ apt-get install -y libgomp1
+
COPY --from=build /app/main /main
ENTRYPOINT [ "/main" ]
diff --git a/.devops/main-rocm.Dockerfile b/.devops/main-rocm.Dockerfile
index 37576d68e..dcaeb3e72 100644
--- a/.devops/main-rocm.Dockerfile
+++ b/.devops/main-rocm.Dockerfile
@@ -40,6 +40,6 @@ ENV LLAMA_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++
-RUN make -j$(nproc)
+RUN make -j$(nproc) main
ENTRYPOINT [ "/app/main" ]
diff --git a/.devops/main-vulkan.Dockerfile b/.devops/main-vulkan.Dockerfile
index 6c2b2ed5b..1bdb52803 100644
--- a/.devops/main-vulkan.Dockerfile
+++ b/.devops/main-vulkan.Dockerfile
@@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=jammy
FROM ubuntu:$UBUNTU_VERSION as build
# Install build tools
-RUN apt update && apt install -y git build-essential cmake wget
+RUN apt update && apt install -y git build-essential cmake wget libgomp1
# Install Vulkan SDK
RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
diff --git a/.devops/main.Dockerfile b/.devops/main.Dockerfile
index 763d75fce..d2514c4ba 100644
--- a/.devops/main.Dockerfile
+++ b/.devops/main.Dockerfile
@@ -9,10 +9,13 @@ WORKDIR /app
COPY . .
-RUN make -j$(nproc)
+RUN make -j$(nproc) main
FROM ubuntu:$UBUNTU_VERSION as runtime
+RUN apt-get update && \
+ apt-get install -y libgomp1
+
COPY --from=build /app/main /main
ENV LC_ALL=C.utf8
diff --git a/.devops/server-cuda.Dockerfile b/.devops/server-cuda.Dockerfile
index 7f5228185..4e9747b82 100644
--- a/.devops/server-cuda.Dockerfile
+++ b/.devops/server-cuda.Dockerfile
@@ -25,12 +25,12 @@ ENV LLAMA_CUDA=1
# Enable cURL
ENV LLAMA_CURL=1
-RUN make -j$(nproc)
+RUN make -j$(nproc) server
FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
RUN apt-get update && \
- apt-get install -y libcurl4-openssl-dev
+ apt-get install -y libcurl4-openssl-dev libgomp1
COPY --from=build /app/server /server
diff --git a/.devops/server.Dockerfile b/.devops/server.Dockerfile
index 0d09d3627..bee63b966 100644
--- a/.devops/server.Dockerfile
+++ b/.devops/server.Dockerfile
@@ -11,12 +11,12 @@ COPY . .
ENV LLAMA_CURL=1
-RUN make -j$(nproc)
+RUN make -j$(nproc) server
FROM ubuntu:$UBUNTU_VERSION as runtime
RUN apt-get update && \
- apt-get install -y libcurl4-openssl-dev
+ apt-get install -y libcurl4-openssl-dev libgomp1
COPY --from=build /app/server /server
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5576c26e1..3e641c19b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -424,6 +424,8 @@ if (LLAMA_CUDA)
list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu")
file(GLOB SRCS "ggml-cuda/template-instances/fattn-wmma*.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
+ file(GLOB SRCS "ggml-cuda/template-instances/mmq*.cu")
+ list(APPEND GGML_SOURCES_CUDA ${SRCS})
add_compile_definitions(GGML_USE_CUDA)
add_compile_definitions(GGML_CUDA_USE_GRAPHS)
@@ -596,6 +598,8 @@ if (LLAMA_HIPBLAS)
list(APPEND GGML_SOURCES_ROCM "ggml-cuda.cu")
file(GLOB SRCS "ggml-cuda/template-instances/fattn-wmma*.cu")
list(APPEND GGML_SOURCES_ROCM ${SRCS})
+ file(GLOB SRCS "ggml-cuda/template-instances/mmq*.cu")
+ list(APPEND GGML_SOURCES_ROCM ${SRCS})
add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUDA)
diff --git a/Makefile b/Makefile
index adc9fa434..b7506aa66 100644
--- a/Makefile
+++ b/Makefile
@@ -454,6 +454,7 @@ ifdef LLAMA_CUBLAS
endif
OBJS_CUDA_TEMP_INST = $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-wmma*.cu))
+OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/mmq*.cu))
ifdef LLAMA_CUDA_FA_ALL_QUANTS
OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*.cu))
else
diff --git a/README.md b/README.md
index 9d2a59d89..09e8cad31 100644
--- a/README.md
+++ b/README.md
@@ -598,7 +598,7 @@ Building the program with BLAS support may lead to some performance improvements
To obtain the official LLaMA 2 weights please see the Obtaining and using the Facebook LLaMA 2 model section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.
-Note: `convert.py` has been moved to `examples/convert-legacy-llama.py` and shouldn't be used for anything other than `Llama/Llama2/Mistral` models and their derievatives.
+Note: `convert.py` has been moved to `examples/convert-legacy-llama.py` and shouldn't be used for anything other than `Llama/Llama2/Mistral` models and their derivatives.
It does not support LLaMA 3, you can use `convert-hf-to-gguf.py` with LLaMA 3 downloaded from Hugging Face.
```bash
diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt
index 0ec8d6d8d..171530c91 100644
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -84,4 +84,4 @@ endif ()
target_include_directories(${TARGET} PUBLIC .)
target_compile_features(${TARGET} PUBLIC cxx_std_11)
-target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama)
+target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
diff --git a/common/common.cpp b/common/common.cpp
index c8df9a4ce..cdcb352b5 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -273,6 +273,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
}
} catch (const std::invalid_argument & ex) {
fprintf(stderr, "%s\n", ex.what());
+ params = params_org;
return false;
}
@@ -408,6 +409,20 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
}
return true;
}
+ if (arg == "--in-file") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ std::ifstream file(argv[i]);
+ if (!file) {
+ fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
+ invalid_param = true;
+ return true;
+ }
+ params.in_files.push_back(argv[i]);
+ return true;
+ }
if (arg == "-n" || arg == "--predict" || arg == "--n-predict") {
if (++i >= argc) {
invalid_param = true;
@@ -1081,7 +1096,15 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
if (arg == "-v" || arg == "--verbose") {
- params.verbose = true;
+ params.verbosity = 1;
+ return true;
+ }
+ if (arg == "--verbosity") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.verbosity = std::stoi(argv[i]);
return true;
}
if (arg == "--verbose-prompt") {
@@ -1391,6 +1414,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.timeout_write = std::stoi(argv[i]);
return true;
}
+ if (arg == "--threads-http") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.n_threads_http = std::stoi(argv[i]);
+ return true;
+ }
if (arg == "-spf" || arg == "--system-prompt-file") {
if (++i >= argc) {
invalid_param = true;
@@ -1537,6 +1568,46 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.i_pos = std::stoi(argv[i]);
return true;
}
+ if (arg == "-o" || arg == "--output" || arg == "--output-file") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.out_file = argv[i];
+ return true;
+ }
+ if (arg == "-ofreq" || arg == "--output-frequency") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.n_out_freq = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "--save-frequency") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.n_save_freq = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "--process-output") {
+ params.process_output = true;
+ return true;
+ }
+ if (arg == "--no-ppl") {
+ params.compute_ppl = false;
+ return true;
+ }
+ if (arg == "--chunk" || arg == "--from-chunk") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.i_chunk = std::stoi(argv[i]);
+ return true;
+ }
#ifndef LOG_DISABLE_LOGS
// Parse args for logging parameters
if (log_param_single_parse(argv[i])) {
@@ -1612,6 +1683,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "*", "-h, --help, --usage", "print usage and exit" });
options.push_back({ "*", " --version", "show version and build info" });
options.push_back({ "*", "-v, --verbose", "print verbose information" });
+ options.push_back({ "*", " --verbosity N", "set specific verbosity level (default: %d)", params.verbosity });
options.push_back({ "*", " --verbose-prompt", "print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false" });
options.push_back({ "*", " --no-display-prompt", "don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false" });
options.push_back({ "*", "-co, --color", "colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false" });
@@ -1637,6 +1709,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "*", "-fa, --flash-attn", "enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled" });
options.push_back({ "*", "-p, --prompt PROMPT", "prompt to start generation with (default: '%s')", params.prompt.c_str() });
options.push_back({ "*", "-f, --file FNAME", "a file containing the prompt (default: none)" });
+ options.push_back({ "*", " --in-file FNAME", "an input file (repeat to specify multiple files)" });
options.push_back({ "*", "-bf, --binary-file FNAME", "binary file containing the prompt (default: none)" });
options.push_back({ "*", "-e, --escape", "process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false" });
options.push_back({ "*", " --no-escape", "do not process escape sequences" });
@@ -1804,6 +1877,14 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "passkey", " --junk N", "number of times to repeat the junk text (default: %d)", params.n_junk });
options.push_back({ "passkey", " --pos N", "position of the passkey in the junk text (default: %d)", params.i_pos });
+ options.push_back({ "imatrix" });
+ options.push_back({ "imatrix", "-o, --output FNAME", "output file (default: '%s')", params.out_file.c_str() });
+ options.push_back({ "imatrix", " --output-frequency N", "output the imatrix every N iterations (default: %d)", params.n_out_freq });
+ options.push_back({ "imatrix", " --save-frequency N", "save an imatrix copy every N iterations (default: %d)", params.n_save_freq });
+ options.push_back({ "imatrix", " --process-output", "collect data for the output tensor (default: %s)", params.process_output ? "true" : "false" });
+ options.push_back({ "imatrix", " --no-ppl", "do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false" });
+ options.push_back({ "imatrix", " --chunk N", "start processing the input from chunk N (default: %d)", params.i_chunk });
+
options.push_back({ "bench" });
options.push_back({ "bench", "-pps", "is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false" });
options.push_back({ "bench", "-npp n0,n1,...", "number of prompt tokens" });
@@ -1820,6 +1901,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "server", " --ssl-key-file FNAME", "path to file a PEM-encoded SSL private key" });
options.push_back({ "server", " --ssl-cert-file FNAME", "path to file a PEM-encoded SSL certificate" });
options.push_back({ "server", " --timeout N", "server read/write timeout in seconds (default: %d)", params.timeout_read });
+ options.push_back({ "server", " --threads-http N", "number of threads used to process HTTP requests (default: %d)", params.n_threads_http });
options.push_back({ "server", " --system-prompt-file FNAME",
"set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications" });
options.push_back({ "server", " --log-format {text,json}",
diff --git a/common/common.h b/common/common.h
index e0a08a61b..35f5311e1 100644
--- a/common/common.h
+++ b/common/common.h
@@ -56,43 +56,42 @@ struct gpt_params {
uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
int32_t n_threads = cpu_get_num_math();
- int32_t n_threads_draft = -1;
- int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
- int32_t n_threads_batch_draft = -1;
- int32_t n_predict = -1; // new tokens to predict
- int32_t n_ctx = 0; // context size
- int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
- int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
- int32_t n_keep = 0; // number of tokens to keep from initial prompt
- int32_t n_draft = 5; // number of tokens to draft during speculative decoding
- int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
- int32_t n_parallel = 1; // number of parallel sequences to decode
- int32_t n_sequences = 1; // number of sequences to decode
- float p_split = 0.1f; // speculative decoding split probability
- int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
- int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
- llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
- int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
- float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
- int32_t n_beams = 0; // if non-zero then use beam search of given width.
- int32_t grp_attn_n = 1; // group-attention factor
- int32_t grp_attn_w = 512; // group-attention width
- int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
- float rope_freq_base = 0.0f; // RoPE base frequency
- float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
+ int32_t n_threads_draft = -1;
+ int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
+ int32_t n_threads_batch_draft = -1;
+ int32_t n_predict = -1; // new tokens to predict
+ int32_t n_ctx = 0; // context size
+ int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
+ int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
+ int32_t n_keep = 0; // number of tokens to keep from initial prompt
+ int32_t n_draft = 5; // number of tokens to draft during speculative decoding
+ int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
+ int32_t n_parallel = 1; // number of parallel sequences to decode
+ int32_t n_sequences = 1; // number of sequences to decode
+ float p_split = 0.1f; // speculative decoding split probability
+ int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
+ int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
+ int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
+ float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
+ int32_t n_beams = 0; // if non-zero then use beam search of given width.
+ int32_t grp_attn_n = 1; // group-attention factor
+ int32_t grp_attn_w = 512; // group-attention width
+ int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
+ float rope_freq_base = 0.0f; // RoPE base frequency
+ float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
- float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor
+ float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor
float yarn_beta_fast = 32.0f; // YaRN low correction dim
- float yarn_beta_slow = 1.0f; // YaRN high correction dim
- int32_t yarn_orig_ctx = 0; // YaRN original context length
+ float yarn_beta_slow = 1.0f; // YaRN high correction dim
+ int32_t yarn_orig_ctx = 0; // YaRN original context length
float defrag_thold = -1.0f; // KV cache defragmentation threshold
- std::string rpc_servers = ""; // comma separated list of RPC servers
ggml_backend_sched_eval_callback cb_eval = nullptr;
void * cb_eval_user_data = nullptr;
ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
+ enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
@@ -114,7 +113,9 @@ struct gpt_params {
std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding
std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding
std::string logits_file = ""; // file for saving *all* logits
+ std::string rpc_servers = ""; // comma separated list of RPC servers
+ std::vector<std::string> in_files; // all input files
std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
std::vector<llama_model_kv_override> kv_overrides;
@@ -124,23 +125,24 @@ struct gpt_params {
std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
+ int32_t verbosity = 0;
int32_t control_vector_layer_start = -1; // layer range for control vector
int32_t control_vector_layer_end = -1; // layer range for control vector
- int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
- int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
- // (which is more convenient to use for plotting)
- //
- bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
- size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
+ int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
+ int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
+ // (which is more convenient to use for plotting)
+ //
+ bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
+ size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
- bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt
- size_t winogrande_tasks= 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
+ bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt
+ size_t winogrande_tasks = 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
- bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
- size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
+ bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
+ size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
- bool kl_divergence = false; // compute KL divergence
+ bool kl_divergence = false; // compute KL divergence
bool usage = false; // print usage
bool use_color = false; // use color to distinguish generations and inputs
@@ -163,7 +165,6 @@ struct gpt_params {
bool logits_all = false; // return logits for all tokens in the batch
bool use_mmap = true; // use mmap for faster loads
bool use_mlock = false; // use mlock to keep model in memory
- bool verbose = false;
bool verbose_prompt = false; // print prompt tokens before generation
bool display_prompt = true; // print prompt before generation
bool infill = false; // use infill mode
@@ -180,10 +181,10 @@ struct gpt_params {
std::vector<std::string> image; // path to image file(s)
// server params
- int32_t port = 8080;
- int32_t timeout_read = 600;
- int32_t timeout_write = timeout_read;
- int32_t n_threads_http = -1;
+ int32_t port = 8080; // server listens on this network port
+ int32_t timeout_read = 600; // http read timeout in seconds
+ int32_t timeout_write = timeout_read; // http write timeout in seconds
+ int32_t n_threads_http = -1; // number of threads to process HTTP requests
std::string hostname = "127.0.0.1";
std::string public_path = "";
@@ -219,6 +220,16 @@ struct gpt_params {
// passkey params
int32_t n_junk = 250; // number of times to repeat the junk text
int32_t i_pos = -1; // position of the passkey in the junk text
+
+ // imatrix params
+ std::string out_file = "imatrix.dat"; // save the resulting imatrix to this file
+
+ int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
+ int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
+ int32_t i_chunk = 0; // start processing from this chunk
+
+ bool process_output = false; // collect data for the output tensor
+ bool compute_ppl = true; // whether to compute perplexity
};
void gpt_params_handle_model_default(gpt_params & params);
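
As a quick illustration of the new `gpt_params` fields above, here is a minimal sketch of a consumer program; it is not part of the patch and assumes only the `gpt_params_parse` / `gpt_params_print_usage` signatures and the `in_files`, `verbosity` and `out_file` members shown in this diff.

```cpp
// Sketch only (not part of the patch): consuming the new gpt_params fields.
#include "common.h"

#include <cstdio>

int main(int argc, char ** argv) {
    gpt_params params;

    if (!gpt_params_parse(argc, argv, params)) {
        gpt_params_print_usage(argc, argv, params);
        return 1;
    }

    // files collected from repeated --in-file arguments
    for (const auto & fname : params.in_files) {
        printf("input file: %s\n", fname.c_str());
    }

    // -v sets verbosity to 1, --verbosity N sets it explicitly
    printf("verbosity = %d, output file = '%s'\n", params.verbosity, params.out_file.c_str());

    return 0;
}
```
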
diff --git a/common/grammar-parser.cpp b/common/grammar-parser.cpp
index b5bc7d49b..a518b766d 100644
--- a/common/grammar-parser.cpp
+++ b/common/grammar-parser.cpp
@@ -46,8 +46,12 @@ namespace grammar_parser {
state.rules[rule_id] = rule;
}
+ static bool is_digit_char(char c) {
+ return '0' <= c && c <= '9';
+ }
+
static bool is_word_char(char c) {
- return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || ('0' <= c && c <= '9');
+ return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || is_digit_char(c);
}
static std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
@@ -99,6 +103,17 @@ namespace grammar_parser {
return pos;
}
+ static const char * parse_int(const char * src) {
+ const char * pos = src;
+ while (is_digit_char(*pos)) {
+ pos++;
+ }
+ if (pos == src) {
+ throw std::runtime_error(std::string("expecting integer at ") + src);
+ }
+ return pos;
+ }
+
static std::pair<uint32_t, const char *> parse_char(const char * src) {
if (*src == '\\') {
switch (src[1]) {
@@ -137,6 +152,60 @@ namespace grammar_parser {
bool is_nested) {
size_t last_sym_start = out_elements.size();
const char * pos = src;
+
+ auto handle_repetitions = [&](int min_times, int max_times) {
+
+ if (last_sym_start == out_elements.size()) {
+ throw std::runtime_error(std::string("expecting preceding item to */+/?/{ at ") + pos);
+ }
+
+ // apply transformation to previous symbol (last_sym_start to end) according to
+ // the following rewrite rules:
+ // S{m,n} --> S S S (m times) S'(n-m)
+ // S'(x) ::= S S'(x-1) |
+ // (... n-m definitions of these S' rules ...)
+ // S'(1) ::= S |
+ // S{m,} --> S S S (m times) S'
+ // S' ::= S S' |
+ // S* --> S{0,}
+ // --> S' ::= S S' |
+ // S+ --> S{1,}
+ // --> S S'
+ // S' ::= S S' |
+ // S? --> S{0,1}
+ // --> S'
+ // S' ::= S |
+
+ std::vector<llama_grammar_element> previous_elements(out_elements.begin() + last_sym_start, out_elements.end());
+ if (min_times == 0) {
+ out_elements.resize(last_sym_start);
+ } else {
+ // Repeat the previous elements (min_times - 1) times
+ for (int i = 1; i < min_times; i++) {
+ out_elements.insert(out_elements.end(), previous_elements.begin(), previous_elements.end());
+ }
+ }
+
+ uint32_t last_rec_rule_id = 0;
+ auto n_opt = max_times < 0 ? 1 : max_times - min_times;
+
+ std::vector<llama_grammar_element> rec_rule(previous_elements);
+ for (int i = 0; i < n_opt; i++) {
+ rec_rule.resize(previous_elements.size());
+ uint32_t rec_rule_id = generate_symbol_id(state, rule_name);
+ if (i > 0 || max_times < 0) {
+ rec_rule.push_back({LLAMA_GRETYPE_RULE_REF, max_times < 0 ? rec_rule_id : last_rec_rule_id});
+ }
+ rec_rule.push_back({LLAMA_GRETYPE_ALT, 0});
+ rec_rule.push_back({LLAMA_GRETYPE_END, 0});
+ add_rule(state, rec_rule_id, rec_rule);
+ last_rec_rule_id = rec_rule_id;
+ }
+ if (n_opt > 0) {
+ out_elements.push_back({LLAMA_GRETYPE_RULE_REF, last_rec_rule_id});
+ }
+ };
+
while (*pos) {
if (*pos == '"') { // literal string
pos++;
@@ -197,40 +266,51 @@ namespace grammar_parser {
throw std::runtime_error(std::string("expecting ')' at ") + pos);
}
pos = parse_space(pos + 1, is_nested);
- } else if (*pos == '*' || *pos == '+' || *pos == '?') { // repetition operator
- if (last_sym_start == out_elements.size()) {
- throw std::runtime_error(std::string("expecting preceding item to */+/? at ") + pos);
- }
-
- // apply transformation to previous symbol (last_sym_start to end) according to
- // rewrite rules:
- // S* --> S' ::= S S' |
- // S+ --> S' ::= S S' | S
- // S? --> S' ::= S |
- uint32_t sub_rule_id = generate_symbol_id(state, rule_name);
- std::vector<llama_grammar_element> sub_rule;
- // add preceding symbol to generated rule
- sub_rule.insert(
- sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end());
- if (*pos == '*' || *pos == '+') {
- // cause generated rule to recurse
- sub_rule.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
- }
- // mark start of alternate def
- sub_rule.push_back({LLAMA_GRETYPE_ALT, 0});
- if (*pos == '+') {
- // add preceding symbol as alternate only for '+' (otherwise empty)
- sub_rule.insert(
- sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end());
- }
- sub_rule.push_back({LLAMA_GRETYPE_END, 0});
- add_rule(state, sub_rule_id, sub_rule);
-
- // in original rule, replace previous symbol with reference to generated rule
- out_elements.resize(last_sym_start);
- out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
-
+ } else if (*pos == '.') { // any char
+ last_sym_start = out_elements.size();
+ out_elements.push_back({LLAMA_GRETYPE_CHAR_ANY, 0});
pos = parse_space(pos + 1, is_nested);
+ } else if (*pos == '*') {
+ pos = parse_space(pos + 1, is_nested);
+ handle_repetitions(0, -1);
+ } else if (*pos == '+') {
+ pos = parse_space(pos + 1, is_nested);
+ handle_repetitions(1, -1);
+ } else if (*pos == '?') {
+ pos = parse_space(pos + 1, is_nested);
+ handle_repetitions(0, 1);
+ } else if (*pos == '{') {
+ pos = parse_space(pos + 1, is_nested);
+
+ if (!is_digit_char(*pos)) {
+ throw std::runtime_error(std::string("expecting an int at ") + pos);
+ }
+ const char * int_end = parse_int(pos);
+ int min_times = std::stoul(std::string(pos, int_end - pos));
+ pos = parse_space(int_end, is_nested);
+
+ int max_times = -1;
+
+ if (*pos == '}') {
+ max_times = min_times;
+ pos = parse_space(pos + 1, is_nested);
+ } else if (*pos == ',') {
+ pos = parse_space(pos + 1, is_nested);
+
+ if (is_digit_char(*pos)) {
+ const char * int_end = parse_int(pos);
+ max_times = std::stoul(std::string(pos, int_end - pos));
+ pos = parse_space(int_end, is_nested);
+ }
+
+ if (*pos != '}') {
+ throw std::runtime_error(std::string("expecting '}' at ") + pos);
+ }
+ pos = parse_space(pos + 1, is_nested);
+ } else {
+ throw std::runtime_error(std::string("expecting ',' at ") + pos);
+ }
+ handle_repetitions(min_times, max_times);
} else {
break;
}
@@ -325,6 +405,7 @@ namespace grammar_parser {
case LLAMA_GRETYPE_CHAR_NOT: return true;
case LLAMA_GRETYPE_CHAR_ALT: return true;
case LLAMA_GRETYPE_CHAR_RNG_UPPER: return true;
+ case LLAMA_GRETYPE_CHAR_ANY: return true;
default: return false;
}
}
@@ -339,6 +420,7 @@ namespace grammar_parser {
case LLAMA_GRETYPE_CHAR_NOT: fprintf(file, "CHAR_NOT"); break;
case LLAMA_GRETYPE_CHAR_RNG_UPPER: fprintf(file, "CHAR_RNG_UPPER"); break;
case LLAMA_GRETYPE_CHAR_ALT: fprintf(file, "CHAR_ALT"); break;
+ case LLAMA_GRETYPE_CHAR_ANY: fprintf(file, "CHAR_ANY"); break;
}
switch (elem.type) {
case LLAMA_GRETYPE_END:
@@ -350,6 +432,7 @@ namespace grammar_parser {
case LLAMA_GRETYPE_CHAR_NOT:
case LLAMA_GRETYPE_CHAR_RNG_UPPER:
case LLAMA_GRETYPE_CHAR_ALT:
+ case LLAMA_GRETYPE_CHAR_ANY:
fprintf(file, "(\"");
print_grammar_char(file, elem.value);
fprintf(file, "\") ");
@@ -407,11 +490,15 @@ namespace grammar_parser {
}
print_grammar_char(file, elem.value);
break;
+ case LLAMA_GRETYPE_CHAR_ANY:
+ fprintf(file, ".");
+ break;
}
if (is_char_element(elem)) {
switch (rule[i + 1].type) {
case LLAMA_GRETYPE_CHAR_ALT:
case LLAMA_GRETYPE_CHAR_RNG_UPPER:
+ case LLAMA_GRETYPE_CHAR_ANY:
break;
default:
fprintf(file, "] ");
diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
index 9a71f5d8d..737bae27c 100644
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@@ -16,58 +16,27 @@ static std::string join(Iterator begin, Iterator end, const std::string & separa
static std::string repeat(const std::string & str, size_t n);
-static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "", bool item_rule_is_literal = false) {
+static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "") {
+ auto has_max = max_items != std::numeric_limits<int>::max();
+
+ if (min_items == 0 && max_items == 1) {
+ return item_rule + "?";
+ }
+
if (separator_rule.empty()) {
- if (min_items == 0 && max_items == 1) {
- return item_rule + "?";
- } else if (min_items == 1 && max_items == std::numeric_limits<int>::max()) {
+ if (min_items == 1 && !has_max) {
return item_rule + "+";
- }
- }
-
- std::string result;
- if (min_items > 0) {
- if (item_rule_is_literal && separator_rule.empty()) {
- result = "\"" + repeat(std::string(item_rule.begin() + 1, item_rule.end() - 1), min_items) + "\"";
+ } else if (min_items == 0 && !has_max) {
+ return item_rule + "*";
} else {
- std::vector<std::string> items(min_items, item_rule);
- result = join(items.begin(), items.end(), separator_rule.empty() ? " " : " " + separator_rule + " ");
+ return item_rule + "{" + std::to_string(min_items) + "," + (has_max ? std::to_string(max_items) : "") + "}";
}
}
- std::function<std::string(int, bool)> opt_repetitions = [&](int up_to_n, bool prefix_with_sep) -> std::string {
- auto content = prefix_with_sep && !separator_rule.empty() ? separator_rule + " " + item_rule : item_rule;
-
- if (up_to_n == 0) {
- return "";
- } else if (up_to_n == 1) {
- return "(" + content + ")?";
- } else if (!separator_rule.empty() && !prefix_with_sep) {
- return "(" + content + " " + opt_repetitions(up_to_n - 1, true) + ")?";
- } else {
- std::string res = repeat("(" + content + " ", up_to_n);
- // strip trailing space
- res = res.substr(0, res.length() - 1);
- res += repeat(")?", up_to_n);
- return res;
- }
- };
-
- if (min_items > 0 && max_items != min_items) {
- result += " ";
+ auto result = item_rule + " " + build_repetition("(" + separator_rule + " " + item_rule + ")", min_items == 0 ? 0 : min_items - 1, has_max ? max_items - 1 : max_items);
+ if (min_items == 0) {
+ result = "(" + result + ")?";
}
-
- if (max_items != std::numeric_limits<int>::max()) {
- result += opt_repetitions(max_items - min_items, min_items > 0);
- } else {
- std::string item_operator = "(" + (separator_rule.empty() ? "" : separator_rule + " ") + item_rule + ")";
- if (min_items == 0 && !separator_rule.empty()) {
- result = "(" + item_rule + " " + item_operator + "*)?";
- } else {
- result += item_operator + "*";
- }
- }
-
return result;
}
@@ -78,30 +47,24 @@ struct BuiltinRule {
std::vector<std::string> deps;
};
-const std::string _up_to_15_digits = build_repetition("[0-9]", 0, 15);
-
std::unordered_map<std::string, BuiltinRule> PRIMITIVE_RULES = {
{"boolean", {"(\"true\" | \"false\") space", {}}},
- {"decimal-part", {"[0-9] " + _up_to_15_digits, {}}},
- {"integral-part", {"[0-9] | [1-9] " + _up_to_15_digits, {}}},
+ {"decimal-part", {"[0-9]{1,16}", {}}},
+ {"integral-part", {"[0] | [1-9] [0-9]{0,15}", {}}},
{"number", {"(\"-\"? integral-part) (\".\" decimal-part)? ([eE] [-+]? integral-part)? space", {"integral-part", "decimal-part"}}},
{"integer", {"(\"-\"? integral-part) space", {"integral-part"}}},
{"value", {"object | array | string | number | boolean | null", {"object", "array", "string", "number", "boolean", "null"}}},
{"object", {"\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space", {"string", "value"}}},
{"array", {"\"[\" space ( value (\",\" space value)* )? \"]\" space", {"value"}}},
- {"uuid", {"\"\\\"\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
- "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
- "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
- "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
- "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] \"\\\"\" space", {}}},
- {"char", {"[^\"\\\\] | \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])", {}}},
+ {"uuid", {"\"\\\"\" [0-9a-fA-F]{8} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{12} \"\\\"\" space", {}}},
+ {"char", {"[^\"\\\\] | \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F]{4})", {}}},
{"string", {"\"\\\"\" char* \"\\\"\" space", {"char"}}},
{"null", {"\"null\" space", {}}},
};
std::unordered_map<std::string, BuiltinRule> STRING_FORMAT_RULES = {
- {"date", {"[0-9] [0-9] [0-9] [0-9] \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )", {}}},
- {"time", {"([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9] [0-9] [0-9] )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )", {}}},
+ {"date", {"[0-9]{4} \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )", {}}},
+ {"time", {"([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9]{3} )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )", {}}},
{"date-time", {"date \"T\" time", {"date", "time"}}},
{"date-string", {"\"\\\"\" date \"\\\"\" space", {"date"}}},
{"time-string", {"\"\\\"\" time \"\\\"\" space", {"time"}}},
@@ -385,8 +348,7 @@ private:
sub_is_literal ? "\"" + sub + "\"" : sub,
min_times,
max_times,
- "",
- sub_is_literal
+ ""
);
seq.back().second = false;
} else {
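
Since `build_repetition` now emits `{m,n}` bounds directly, array size constraints in a JSON schema should surface as compact repetitions in the generated grammar; the sketch below is not part of the patch and assumes the existing `common/json-schema-to-grammar.h` entry point (which pulls in the vendored `json.hpp`).

```cpp
// Sketch only (not part of the patch): minItems/maxItems through json_schema_to_grammar.
#include "json-schema-to-grammar.h" // assumed to include the vendored nlohmann json.hpp

#include <cstdio>
#include <string>

int main() {
    nlohmann::ordered_json schema = nlohmann::ordered_json::parse(R"({
        "type": "array",
        "items": { "type": "integer" },
        "minItems": 2,
        "maxItems": 4
    })");

    // With this change the tail items are expected to appear as a "{1,3}" style
    // repetition instead of nested "( ... )?" groups.
    std::string grammar = json_schema_to_grammar(schema);
    printf("%s\n", grammar.c_str());
    return 0;
}
```
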
diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py
index 84b72348d..f43b15760 100755
--- a/convert-hf-to-gguf-update.py
+++ b/convert-hf-to-gguf-update.py
@@ -1,4 +1,5 @@
#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
# This script downloads the tokenizer models of the specified models from Huggingface and
# generates the get_vocab_base_pre() function for convert-hf-to-gguf.py
@@ -82,6 +83,7 @@ models = [
{"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
{"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
+ {"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
]
diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index ad071b974..a86864f04 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -1,4 +1,5 @@
#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
from __future__ import annotations
@@ -474,6 +475,9 @@ class Model:
if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
# ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
res = "smaug-bpe"
+ if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
+ # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
+ res = "jina-v2-code"
if res is None:
logger.warning("\n")
@@ -2451,11 +2455,13 @@ class JinaBertV2Model(BertModel):
def get_tensors(self):
for name, data in super().get_tensors():
- if 'gated_layers' in name:
+ if 'gated_layer' in name:
d1 = data[:self.intermediate_size, :]
name1 = name.replace('gated_layers', 'gated_layers_w')
+ name1 = name1.replace('up_gated_layer', 'gated_layers_v')
d2 = data[self.intermediate_size:, :]
name2 = name.replace('gated_layers', 'gated_layers_v')
+ name2 = name2.replace('up_gated_layer', 'gated_layers_w')
yield name1, d1
yield name2, d2
continue
diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp
index bf0125e75..4f6c3746a 100644
--- a/examples/baby-llama/baby-llama.cpp
+++ b/examples/baby-llama/baby-llama.cpp
@@ -522,8 +522,8 @@ static struct ggml_tensor * forward(
// wk shape [n_embd, n_embd, 1, 1]
// Qcur shape [n_embd/n_head, n_head, N, 1]
// Kcur shape [n_embd/n_head, n_head, N, 1]
- struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0, 0);
- struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0, 0);
+ struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0);
+ struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0);
// store key and value to memory
{
@@ -759,8 +759,8 @@ static struct ggml_tensor * forward_batch(
// wk shape [n_embd, n_embd, 1, 1]
// Qcur shape [n_embd/n_head, n_head, N, n_batch]
// Kcur shape [n_embd/n_head, n_head, N, n_batch]
- struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), KQ_pos, n_rot, 0, 0);
- struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), KQ_pos, n_rot, 0, 0);
+ struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), KQ_pos, n_rot, 0);
+ struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), KQ_pos, n_rot, 0);
assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch);
assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch);
@@ -1056,7 +1056,7 @@ static struct ggml_tensor * forward_lora(
model->layers[il].wqb,
cur)),
n_embd/n_head, n_head, N),
- KQ_pos, n_rot, 0, 0);
+ KQ_pos, n_rot, 0);
struct ggml_tensor * Kcur = ggml_rope(ctx0,
ggml_reshape_3d(ctx0,
ggml_mul_mat(ctx0,
@@ -1065,7 +1065,7 @@ static struct ggml_tensor * forward_lora(
model->layers[il].wkb,
cur)),
n_embd/n_head, n_head, N),
- KQ_pos, n_rot, 0, 0);
+ KQ_pos, n_rot, 0);
// store key and value to memory
{
diff --git a/examples/convert-legacy-llama.py b/examples/convert-legacy-llama.py
index fd8401015..721a57c00 100755
--- a/examples/convert-legacy-llama.py
+++ b/examples/convert-legacy-llama.py
@@ -176,7 +176,7 @@ class Params:
rope_scaling_type: gguf.RopeScalingType | None = None
f_rope_freq_base: float | None = None
f_rope_scale: float | None = None
- n_orig_ctx: int | None = None
+ n_ctx_orig: int | None = None
rope_finetuned: bool | None = None
ftype: GGMLFileType | None = None
@@ -226,7 +226,7 @@ class Params:
with open(config_path) as f:
config = json.load(f)
- rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None
+ rope_scaling_type = f_rope_scale = n_ctx_orig = rope_finetuned = None
rope_scaling = config.get("rope_scaling")
if rope_scaling is not None and (typ := rope_scaling.get("type")):
@@ -236,7 +236,7 @@ class Params:
rope_scaling_type = gguf.RopeScalingType.LINEAR
elif typ == "yarn":
rope_scaling_type = gguf.RopeScalingType.YARN
- n_orig_ctx = rope_scaling['original_max_position_embeddings']
+ n_ctx_orig = rope_scaling['original_max_position_embeddings']
rope_finetuned = rope_scaling['finetuned']
else:
raise NotImplementedError(f'Unknown rope scaling type: {typ}')
@@ -272,7 +272,7 @@ class Params:
f_rope_freq_base = config.get("rope_theta"),
rope_scaling_type = rope_scaling_type,
f_rope_scale = f_rope_scale,
- n_orig_ctx = n_orig_ctx,
+ n_ctx_orig = n_ctx_orig,
rope_finetuned = rope_finetuned,
)
@@ -864,8 +864,8 @@ class OutputFile:
self.gguf.add_rope_scaling_type(params.rope_scaling_type)
self.gguf.add_rope_scaling_factor(params.f_rope_scale)
- if params.n_orig_ctx is not None:
- self.gguf.add_rope_scaling_orig_ctx_len(params.n_orig_ctx)
+ if params.n_ctx_orig is not None:
+ self.gguf.add_rope_scaling_orig_ctx_len(params.n_ctx_orig)
if params.rope_finetuned is not None:
self.gguf.add_rope_scaling_finetuned(params.rope_finetuned)
diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp
index 22425730f..71a4333ee 100644
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@@ -564,7 +564,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
const int rope_mode = 0;
return ggml_rope_ext(ctx,
- t, KQ_pos, nullptr, n_rot, rope_mode, n_ctx, 0,
+ t, KQ_pos, nullptr, n_rot, rope_mode, n_ctx,
rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
);
};
diff --git a/examples/gguf-split/gguf-split.cpp b/examples/gguf-split/gguf-split.cpp
index e04feeae3..881f0451c 100644
--- a/examples/gguf-split/gguf-split.cpp
+++ b/examples/gguf-split/gguf-split.cpp
@@ -61,10 +61,10 @@ static size_t split_str_to_n_bytes(std::string str) {
int n;
if (str.back() == 'M') {
sscanf(str.c_str(), "%d", &n);
- n_bytes = (size_t)n * 1024 * 1024; // megabytes
+ n_bytes = (size_t)n * 1000 * 1000; // megabytes
} else if (str.back() == 'G') {
sscanf(str.c_str(), "%d", &n);
- n_bytes = (size_t)n * 1024 * 1024 * 1024; // gigabytes
+ n_bytes = (size_t)n * 1000 * 1000 * 1000; // gigabytes
} else {
throw std::invalid_argument("error: supported units are M (megabytes) or G (gigabytes), but got: " + std::string(1, str.back()));
}
@@ -284,7 +284,7 @@ struct split_strategy {
struct ggml_tensor * t = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_out, i));
total_size += ggml_nbytes(t);
}
- total_size = total_size / 1024 / 1024; // convert to megabytes
+ total_size = total_size / 1000 / 1000; // convert to megabytes
printf("split %05d: n_tensors = %d, total_size = %ldM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size);
i_split++;
}
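
The unit change above means `M` and `G` suffixes are now decimal (SI) rather than binary; a tiny sketch of the difference, not part of the patch:

```cpp
// Sketch only (not part of the patch): SI vs binary interpretation of "250M".
#include <cstdio>
#include <cstddef>

int main() {
    const int n = 250;

    const size_t si_bytes  = (size_t) n * 1000 * 1000; // new behaviour: 250 MB
    const size_t mib_bytes = (size_t) n * 1024 * 1024; // old behaviour: 250 MiB

    printf("250M -> %zu bytes (SI), previously %zu bytes (binary)\n", si_bytes, mib_bytes);
    return 0;
}
```
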
diff --git a/examples/imatrix/README.md b/examples/imatrix/README.md
index 458c01b87..866ca9f56 100644
--- a/examples/imatrix/README.md
+++ b/examples/imatrix/README.md
@@ -6,16 +6,19 @@ More information is available here: https://github.com/ggerganov/llama.cpp/pull/
## Usage
```
-./imatrix -m <some_fp_model> -f <some_training_data> [-o <output_file>] [--verbosity <verbosity_level>]
- [-ofreq num_chunks] [-ow <0 or 1>] [other common params]
+./imatrix \
+ -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \
+ [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \
+ [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]
```
Here `-m` with a model name and `-f` with a file containing training data (such as e.g. `wiki.train.raw`) are mandatory.
The parameters in square brackets are optional and have the following meaning:
* `-o` (or `--output-file`) specifies the name of the file where the computed data will be stored. If missing `imatrix.dat` is used.
* `--verbosity` specifies the verbosity level. If set to `0`, no output other than the perplexity of the processed chunks will be generated. If set to `1`, each time the results are saved a message is written to `stderr`. If `>=2`, a message is output each time data is collected for any tensor. Default verbosity level is `1`.
-* `-ofreq` (or `--output-frequency`) specifies how often the so far computed result is saved to disk. Default is 10 (i.e., every 10 chunks)
-* `-ow` (or `--output-weight`) specifies if data will be collected for the `output.weight` tensor. My experience is that it is better to not utilize the importance matrix when quantizing `output.weight`, so this is set to `false` by default.
+* `--output-frequency` specifies how often the so far computed result is saved to disk. Default is 10 (i.e., every 10 chunks)
+* `--save-frequency` specifies how often to save a copy of the imatrix in a separate file. Default is 0 (i.e., never)
+* `--process-output` specifies if data will be collected for the `output.weight` tensor. My experience is that it is better to not utilize the importance matrix when quantizing `output.weight`, so this is set to `false` by default.
For faster computation, make sure to use GPU offloading via the `-ngl` argument
diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp
index e050c09d2..e18f49563 100644
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -17,39 +17,37 @@
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
+static void print_usage(int argc, char ** argv, const gpt_params & params) {
+ gpt_params_print_usage(argc, argv, params);
+
+ LOG_TEE("\nexample usage:\n");
+ LOG_TEE("\n %s \\\n"
+ " -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \\\n"
+ " [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n"
+ " [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]\n" , argv[0]);
+ LOG_TEE("\n");
+}
+
struct Stats {
std::vector<float> values;
std::vector<int> counts;
int ncall = 0;
};
-struct StatParams {
- std::string dataset;
- std::string ofile = "imatrix.dat";
- int n_output_frequency = 10;
- int verbosity = 1;
- int keep_every = 0;
- bool collect_output_weight = false;
-};
-
class IMatrixCollector {
public:
IMatrixCollector() = default;
- void set_parameters(StatParams&& params) { m_params = std::move(params); }
+ void set_params(gpt_params params) { m_params = std::move(params); }
bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
- void save_imatrix() const;
- bool load_imatrix(const char * file_name, bool add);
- static bool load_imatrix(const char * file_name, std::unordered_map<std::string, Stats>& imatrix);
+ void save_imatrix(int ncall = -1) const;
+ bool load_imatrix(const char * file_name);
private:
std::unordered_map<std::string, Stats> m_stats;
- StatParams m_params;
+ gpt_params m_params;
std::mutex m_mutex;
int m_last_call = 0;
std::vector<float> m_src1_data;
std::vector<char> m_ids; // the expert ids from ggml_mul_mat_id
- //
- void save_imatrix(const char * file_name, const char * dataset) const;
- void keep_imatrix(int ncall) const;
};
// remove any prefix and suffixes from the name
@@ -85,7 +83,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
if (t->op != GGML_OP_MUL_MAT) return false;
// why are small batches ignored (<16 tokens)?
if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false;
- if (!(wname.substr(0, 4) == "blk." || (m_params.collect_output_weight && wname == "output.weight"))) return false;
+ if (!(wname.substr(0, 4) == "blk." || (m_params.process_output && wname == "output.weight"))) return false;
return true;
}
@@ -153,21 +151,25 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
for (int j = 0; j < (int)src1->ne[0]; ++j) {
e.values[e_start + j] += x[j]*x[j];
e.counts[e_start + j]++;
+ if (!std::isfinite(e.values[e_start + j])) {
+ fprintf(stderr, "%f detected in %s\n", e.values[e_start + j], wname.c_str());
+ exit(1);
+ }
}
}
}
if (e.ncall > m_last_call) {
m_last_call = e.ncall;
- if (m_last_call % m_params.n_output_frequency == 0) {
+ if (m_last_call % m_params.n_out_freq == 0) {
save_imatrix();
}
- if (m_params.keep_every > 0 && m_last_call%m_params.keep_every == 0) {
- keep_imatrix(m_last_call);
+ if (m_params.n_save_freq > 0 && m_last_call%m_params.n_save_freq == 0) {
+ save_imatrix(m_last_call);
}
}
}
} else {
- auto& e = m_stats[wname];
+ auto & e = m_stats[wname];
if (e.values.empty()) {
e.values.resize(src1->ne[0], 0);
e.counts.resize(src1->ne[0], 0);
@@ -185,15 +187,19 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
for (int j = 0; j < (int)src1->ne[0]; ++j) {
e.values[j] += x[j]*x[j];
e.counts[j]++;
+ if (!std::isfinite(e.values[j])) {
+ fprintf(stderr, "%f detected in %s\n", e.values[j], wname.c_str());
+ exit(1);
+ }
}
}
if (e.ncall > m_last_call) {
m_last_call = e.ncall;
- if (m_last_call % m_params.n_output_frequency == 0) {
+ if (m_last_call % m_params.n_out_freq == 0) {
save_imatrix();
}
- if (m_params.keep_every > 0 && m_last_call%m_params.keep_every == 0) {
- keep_imatrix(m_last_call);
+ if (m_params.n_save_freq > 0 && m_last_call%m_params.n_save_freq == 0) {
+ save_imatrix(m_last_call);
}
}
}
@@ -201,19 +207,17 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
return true;
}
-void IMatrixCollector::save_imatrix() const {
- save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str(), m_params.dataset.c_str());
-}
+void IMatrixCollector::save_imatrix(int ncall) const {
+ auto fname = m_params.out_file;
+ if (fname.empty()) {
+ fname = "imatrix.dat";
+ }
-void IMatrixCollector::keep_imatrix(int ncall) const {
- auto file_name = m_params.ofile;
- if (file_name.empty()) file_name = "imatrix.dat";
- file_name += ".at_";
- file_name += std::to_string(ncall);
- save_imatrix(file_name.c_str(), m_params.dataset.c_str());
-}
+ if (ncall > 0) {
+ fname += ".at_";
+ fname += std::to_string(ncall);
+ }
-void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) const {
std::ofstream out(fname, std::ios::binary);
int n_entries = m_stats.size();
out.write((const char *) &n_entries, sizeof(n_entries));
@@ -236,26 +240,28 @@ void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) co
// Write the number of call the matrix was computed with
out.write((const char *) &m_last_call, sizeof(m_last_call));
- // Write the dataset name at the end of the file to later on specify it in quantize
- int n_dataset = strlen(dataset);
- out.write((const char *) &n_dataset, sizeof(n_dataset));
- out.write(dataset, n_dataset);
+ // Write the input filename at the end of the file to later on specify it in quantize
+ {
+ int len = m_params.prompt_file.size();
+ out.write((const char *) &len, sizeof(len));
+ out.write(m_params.prompt_file.c_str(), len);
+ }
if (m_params.verbosity > 0) {
- fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname);
+ fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str());
}
}
-bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_map<std::string, Stats>& imatrix_data) {
- std::ifstream in(imatrix_file, std::ios::binary);
+bool IMatrixCollector::load_imatrix(const char * fname) {
+ std::ifstream in(fname, std::ios::binary);
if (!in) {
- printf("%s: failed to open %s\n",__func__,imatrix_file);
+ printf("%s: failed to open %s\n",__func__, fname);
return false;
}
int n_entries;
in.read((char*)&n_entries, sizeof(n_entries));
if (in.fail() || n_entries < 1) {
- printf("%s: no data in file %s\n", __func__, imatrix_file);
+ printf("%s: no data in file %s\n", __func__, fname);
return false;
}
for (int i = 0; i < n_entries; ++i) {
@@ -263,23 +269,22 @@ bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_ma
std::vector<char> name_as_vec(len+1);
in.read((char *)name_as_vec.data(), len);
if (in.fail()) {
- printf("%s: failed reading name for entry %d from %s\n",__func__,i+1,imatrix_file);
+ printf("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname);
return false;
}
name_as_vec[len] = 0;
std::string name{name_as_vec.data()};
- auto& e = imatrix_data[std::move(name)];
+ auto & e = m_stats[std::move(name)];
int ncall;
in.read((char*)&ncall, sizeof(ncall));
int nval;
in.read((char *)&nval, sizeof(nval));
if (in.fail() || nval < 1) {
printf("%s: failed reading number of values for entry %d\n",__func__,i);
- imatrix_data = {};
+ m_stats = {};
return false;
}
- // When re-called from load_imatrix() with add set, this will already be created.
if (e.values.empty()) {
e.values.resize(nval, 0);
e.counts.resize(nval, 0);
@@ -289,7 +294,7 @@ bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_ma
in.read((char*)tmp.data(), nval*sizeof(float));
if (in.fail()) {
printf("%s: failed reading data for entry %d\n",__func__,i);
- imatrix_data = {};
+ m_stats = {};
return false;
}
@@ -304,13 +309,6 @@ bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_ma
return true;
}
-bool IMatrixCollector::load_imatrix(const char * file_name, bool add) {
- if (!add) {
- m_stats.clear();
- }
- return load_imatrix(file_name, m_stats);
-}
-
static IMatrixCollector g_collector;
static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
@@ -324,7 +322,7 @@ struct results_log_softmax {
float prob;
};
-static std::vector<float> softmax(const std::vector<float>& logits) {
+static std::vector<float> softmax(const std::vector<float> & logits) {
std::vector<float> probs(logits.size());
float max_logit = logits[0];
for (float v : logits) {
@@ -358,8 +356,7 @@ static results_log_softmax log_softmax(int n_vocab, const float * logits, int to
static void process_logits(
int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
- double & nll, double & nll2, float * logit_history, float * prob_history
-) {
+ double & nll, double & nll2, float * logit_history, float * prob_history) {
std::mutex mutex;
int counter = 0;
auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () {
@@ -391,8 +388,7 @@ static void process_logits(
}
}
-static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl, int from_chunk) {
-
+static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
const int n_ctx = llama_n_ctx(ctx);
@@ -405,13 +401,13 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
auto tim2 = std::chrono::high_resolution_clock::now();
fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast(tim2-tim1).count());
- if (from_chunk > 0) {
- if (size_t((from_chunk + 2)*n_ctx) >= tokens.size()) {
- fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, from_chunk);
+ if (params.i_chunk > 0) {
+ if (size_t((params.i_chunk + 2)*n_ctx) >= tokens.size()) {
+ fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, params.i_chunk);
return false;
}
- fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, from_chunk, from_chunk*n_ctx);
- tokens.erase(tokens.begin(), tokens.begin() + from_chunk*n_ctx);
+ fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx);
+ tokens.erase(tokens.begin(), tokens.begin() + params.i_chunk*n_ctx);
}
if (int(tokens.size()) < 2*n_ctx) {
@@ -424,7 +420,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
std::vector<float> logit_history;
std::vector<float> prob_history;
- if (compute_ppl) {
+ if (params.compute_ppl) {
logit_history.resize(tokens.size());
prob_history.resize(tokens.size());
}
@@ -446,7 +442,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
const int num_batches = (n_ctx + n_batch - 1) / n_batch;
std::vector<float> logits;
- if (compute_ppl && num_batches > 1) {
+ if (params.compute_ppl && num_batches > 1) {
logits.reserve((size_t)n_ctx * n_vocab);
}
@@ -482,7 +478,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
// restore the original token in case it was set to BOS
tokens[batch_start] = token_org;
- if (compute_ppl && num_batches > 1) {
+ if (params.compute_ppl && num_batches > 1) {
const auto * batch_logits = llama_get_logits(ctx);
logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
}
@@ -501,7 +497,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
}
- if (compute_ppl) {
+ if (params.compute_ppl) {
const int first = n_ctx/2;
const auto all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
@@ -516,7 +512,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
}
printf("\n");
- if (compute_ppl) {
+ if (params.compute_ppl) {
nll2 /= count;
nll /= count;
const double ppl = exp(nll);
@@ -533,109 +529,32 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
}
int main(int argc, char ** argv) {
- StatParams sparams;
- std::string prev_result_file;
- std::string combine_files;
- bool compute_ppl = true;
- int from_chunk = 0;
- std::vector<char *> args;
- args.push_back(argv[0]);
- int iarg = 1;
- for (; iarg < argc-1; ++iarg) {
- std::string arg{argv[iarg]};
- if (arg == "-o" || arg == "--output-file") {
- sparams.ofile = argv[++iarg];
- }
- else if (arg == "-ofreq" || arg == "--output-frequency") {
- sparams.n_output_frequency = std::stoi(argv[++iarg]);
- }
- else if (arg == "-ow" || arg == "--output-weight") {
- sparams.collect_output_weight = std::stoi(argv[++iarg]);
- }
- else if (arg == "--verbosity") {
- sparams.verbosity = std::stoi(argv[++iarg]);
- } else if (arg == "--no-ppl") {
- compute_ppl = false;
- } else if (arg == "--keep-imatrix") {
- sparams.keep_every = std::stoi(argv[++iarg]);
- } else if (arg == "--continue-from") {
- prev_result_file = argv[++iarg];
- } else if (arg == "--combine") {
- combine_files = argv[++iarg];
- }
- else if (arg == "--from-chunk") {
- from_chunk = std::stoi(argv[++iarg]);
- } else {
- args.push_back(argv[iarg]);
- }
- }
- if (iarg < argc) {
- std::string arg{argv[iarg]};
- if (arg == "--no-ppl") {
- compute_ppl = false;
- } else {
- args.push_back(argv[iarg]);
- }
- }
-
gpt_params params;
- params.n_batch = 512;
+
+ params.n_ctx = 512;
+ params.logits_all = true;
+ params.verbosity = 1;
if (!gpt_params_parse(argc, argv, params)) {
- gpt_params_print_usage(argc, argv, params);
+ print_usage(argc, argv, params);
return 1;
}
- params.logits_all = true;
params.n_batch = std::min(params.n_batch, params.n_ctx);
- print_build_info();
+ g_collector.set_params(params);
- if (params.seed == LLAMA_DEFAULT_SEED) {
- params.seed = time(NULL);
- }
-
- fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
-
- std::mt19937 rng(params.seed);
-
- sparams.dataset = params.prompt_file;
- g_collector.set_parameters(std::move(sparams));
-
- if (!combine_files.empty()) {
- std::vector<std::string> files;
- size_t pos = 0;
- while (true) {
- auto new_pos = combine_files.find(',', pos);
- if (new_pos != std::string::npos) {
- files.emplace_back(combine_files.substr(pos, new_pos - pos));
- pos = new_pos + 1;
- } else {
- files.emplace_back(combine_files.substr(pos));
- break;
- }
- }
- if (files.size() < 2) {
- fprintf(stderr, "You must provide at least two comma separated files to use --combine\n");
+ for (const auto & in_file : params.in_files) {
+ printf("%s : loading imatrix from '%s'\n", __func__, in_file.c_str());
+ if (!g_collector.load_imatrix(in_file.c_str())) {
+ fprintf(stderr, "%s : failed to load %s\n", __func__, in_file.c_str());
return 1;
}
- printf("Combining the following %d files\n", int(files.size()));
- for (auto& file : files) {
- printf(" %s\n", file.c_str());
- if (!g_collector.load_imatrix(file.c_str(), true)) {
- fprintf(stderr, "Failed to load %s\n", file.c_str());
- return 1;
- }
- }
+ }
+
+ if (params.in_files.size() > 1) {
+ printf("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str());
g_collector.save_imatrix();
- return 0;
- }
-
- if (!prev_result_file.empty()) {
- if (!g_collector.load_imatrix(prev_result_file.c_str(), false)) {
- fprintf(stderr, "=============== Failed to load %s\n", prev_result_file.c_str());
- return 1;
- }
}
llama_backend_init();
@@ -650,6 +569,7 @@ int main(int argc, char ** argv) {
// init
llama_model * model;
llama_context * ctx;
+
std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (model == nullptr || ctx == nullptr) {
fprintf(stderr, "%s : failed to init\n", __func__);
@@ -668,8 +588,7 @@ int main(int argc, char ** argv) {
fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
}
- bool OK = compute_imatrix(ctx, params, compute_ppl, from_chunk);
- if (!OK) {
+ if (!compute_imatrix(ctx, params)) {
return 1;
}
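With the refactor above, the imatrix example no longer parses its own flags; everything comes in through the common `gpt_params` parser (`params.in_files`, `params.out_file`, `params.i_chunk`, `params.compute_ppl`). A minimal usage sketch, assuming the common-arg spellings `--in-file` and `-o/--output-file` map onto those fields (model and data file names are placeholders):

```sh
# compute an importance matrix from a calibration text
./imatrix -m ggml-model-f16.gguf -f calibration.txt -o ggml-model.imatrix

# merge several previously computed matrices into a single file
./imatrix --in-file run1.imatrix --in-file run2.imatrix -o combined.imatrix
```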
diff --git a/examples/json_schema_to_grammar.py b/examples/json_schema_to_grammar.py
index 826cd3f72..7d889c3fe 100755
--- a/examples/json_schema_to_grammar.py
+++ b/examples/json_schema_to_grammar.py
@@ -6,52 +6,22 @@ import re
import sys
from typing import Any, Dict, List, Set, Tuple, Union
-def _build_repetition(item_rule, min_items, max_items, separator_rule=None, item_rule_is_literal=False):
+
+def _build_repetition(item_rule, min_items, max_items, separator_rule=None):
+
+ if min_items == 0 and max_items == 1:
+ return f'{item_rule}?'
+
if not separator_rule:
- if min_items == 0 and max_items == 1:
- return f'{item_rule}?'
- elif min_items == 1 and max_items is None:
+ if min_items == 1 and max_items is None:
return f'{item_rule}+'
-
- result = ''
-
- if min_items > 0:
- if item_rule_is_literal and separator_rule is None:
- result = '"' + (item_rule[1:-1] * min_items) + '"'
+ elif min_items == 0 and max_items is None:
+ return f'{item_rule}*'
else:
- result = (f' {separator_rule} ' if separator_rule else ' ').join([item_rule] * min_items)
+ return f'{item_rule}{{{min_items},{max_items if max_items is not None else ""}}}'
- def opt_repetitions(up_to_n, prefix_with_sep=False):
- '''
- - n=4, no sep: '(a (a (a (a)?)?)?)?'
- - n=4, sep=',', prefix: '("," a ("," a ("," a ("," a)?)?)?)?'
- - n=4, sep=',', no prefix: '(a ("," a ("," a ("," a)?)?)?)?'
- '''
-
- content = f'{separator_rule} {item_rule}' if prefix_with_sep and separator_rule else item_rule
- if up_to_n == 0:
- return ''
- elif up_to_n == 1:
- return f'({content})?'
- elif separator_rule and not prefix_with_sep:
- return f'({content} {opt_repetitions(up_to_n - 1, prefix_with_sep=True)})?'
- else:
- return (f'({content} ' * up_to_n).rstrip() + (')?' * up_to_n)
-
- if min_items > 0 and max_items != min_items:
- result += ' '
-
- if max_items is not None:
- result += opt_repetitions(max_items - min_items, prefix_with_sep=min_items > 0)
- else:
- item_operator = f'({separator_rule + " " if separator_rule else ""}{item_rule})'
-
- if min_items == 0 and separator_rule:
- result = f'({item_rule} {item_operator}*)?'
- else:
- result += f'{item_operator}*'
-
- return result
+ result = item_rule + ' ' + _build_repetition(f'({separator_rule} {item_rule})', min_items - 1 if min_items > 0 else 0, max_items - 1 if max_items is not None else None)
+ return f'({result})?' if min_items == 0 else result
class BuiltinRule:
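The rewritten `_build_repetition` leans on the GBNF `{m,n}` repetition syntax instead of expanding repetitions by hand. As an informal sanity check of the branches above (not part of the patch), one would expect:

```python
assert _build_repetition('[0-9]', 0, 15) == '[0-9]{0,15}'
assert _build_repetition('item', 1, None) == 'item+'
# with a separator, the helper recurses on '(sep item)' and wraps the result
assert _build_repetition('item', 0, None, separator_rule='","') == '(item ("," item)*)?'
```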
@@ -59,31 +29,29 @@ class BuiltinRule:
self.content = content
self.deps = deps or []
-_up_to_15_digits = _build_repetition('[0-9]', 0, 15)
-
# whitespace is constrained to a single space char to prevent model "running away" in
# whitespace. Also maybe improves generation quality?
SPACE_RULE = '" "?'
PRIMITIVE_RULES = {
'boolean' : BuiltinRule('("true" | "false") space', []),
- 'decimal-part' : BuiltinRule('[0-9] ' + _up_to_15_digits, []),
- 'integral-part': BuiltinRule('[0-9] | [1-9] ' + _up_to_15_digits, []),
+ 'decimal-part' : BuiltinRule('[0-9]{1,16}', []),
+ 'integral-part': BuiltinRule('[0] | [1-9] [0-9]{0,15}', []),
'number' : BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']),
'integer' : BuiltinRule('("-"? integral-part) space', ['integral-part']),
'value' : BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']),
'object' : BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']),
'array' : BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']),
- 'uuid' : BuiltinRule(r'"\"" ' + ' "-" '.join('[0-9a-fA-F]' * n for n in [8, 4, 4, 4, 12]) + r' "\"" space', []),
- 'char' : BuiltinRule(r'[^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])', []),
+ 'uuid' : BuiltinRule(r'"\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\"" space', []),
+ 'char' : BuiltinRule(r'[^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F]{4})', []),
'string' : BuiltinRule(r'"\"" char* "\"" space', ['char']),
'null' : BuiltinRule('"null" space', []),
}
# TODO: support "uri", "email" string formats
STRING_FORMAT_RULES = {
- 'date' : BuiltinRule('[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []),
- 'time' : BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []),
+ 'date' : BuiltinRule('[0-9]{4} "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []),
+ 'time' : BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9]{3} )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []),
'date-time' : BuiltinRule('date "T" time', ['date', 'time']),
'date-string' : BuiltinRule('"\\"" date "\\"" space', ['date']),
'time-string' : BuiltinRule('"\\"" time "\\"" space', ['time']),
@@ -333,7 +301,7 @@ class SchemaConverter:
sub_rule_ids[sub] = id
sub = id
- seq[-1] = (_build_repetition(f'"{sub}"' if sub_is_literal else sub, min_times, max_times, item_rule_is_literal=sub_is_literal), False)
+ seq[-1] = (_build_repetition(f'"{sub}"' if sub_is_literal else sub, min_times, max_times), False)
else:
literal = ''
while i < length:
diff --git a/examples/main/README.md b/examples/main/README.md
index 4eaa68475..cdc002f15 100644
--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -69,7 +69,6 @@ In this section, we cover the most commonly used options for running the `main`
- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`; inferred from `--model-url` if set).
- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (e.g https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf).
- `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
-- `-ins, --instruct`: Run the program in instruction mode, which is particularly useful when working with Alpaca models.
- `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
@@ -83,7 +82,7 @@ The `main` program provides several ways to interact with the LLaMA models using
## Interaction
-The `main` program offers a seamless way to interact with LLaMA models, allowing users to engage in real-time conversations or provide instructions for specific tasks. The interactive mode can be triggered using various options, including `--interactive`, `--interactive-first`, and `--instruct`.
+The `main` program offers a seamless way to interact with LLaMA models, allowing users to engage in real-time conversations or provide instructions for specific tasks. The interactive mode can be triggered using various options, including `--interactive` and `--interactive-first`.
In interactive mode, users can participate in text generation by injecting their input during the process. Users can press `Ctrl+C` at any time to interject and type their input, followed by pressing `Return` to submit it to the LLaMA model. To submit additional lines without finalizing input, users can end the current line with a backslash (`\`) and continue typing.
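For example, a typical interactive invocation combines these options (the model path is a placeholder):

```sh
./main -m models/7B/ggml-model.gguf --color -i -r "User:" -p "Hello, how are you?"
```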
@@ -91,7 +90,6 @@ In interactive mode, users can participate in text generation by injecting their
- `-i, --interactive`: Run the program in interactive mode, allowing users to engage in real-time conversations or provide specific instructions to the model.
- `--interactive-first`: Run the program in interactive mode and immediately wait for user input before starting the text generation.
-- `-ins, --instruct`: Run the program in instruction mode, which is specifically designed to work with Alpaca models that excel in completing tasks based on user instructions.
- `--color`: Enable colorized output to visually distinguish between prompts, user input, and generated text.
By understanding and utilizing these interaction options, you can create engaging and dynamic experiences with the LLaMA models, tailoring the text generation process to your specific needs.
@@ -120,16 +118,6 @@ The `--in-suffix` flag is used to add a suffix after your input. This is useful
./main -r "User:" --in-prefix " " --in-suffix "Assistant:"
```
-### Instruction Mode
-
-Instruction mode is particularly useful when working with Alpaca models, which are designed to follow user instructions for specific tasks:
-
-- `-ins, --instruct`: Enable instruction mode to leverage the capabilities of Alpaca models in completing tasks based on user-provided instructions.
-
-Technical detail: the user's input is internally prefixed with the reverse prompt (or `### Instruction:` as the default), and followed by `### Response:` (except if you just press Return without any input, to keep generating a longer response).
-
-By understanding and utilizing these interaction options, you can create engaging and dynamic experiences with the LLaMA models, tailoring the text generation process to your specific needs.
-
## Context Management
During text generation, LLaMA models have a limited context size, which means they can only consider a certain number of tokens from the input and generated text. When the context fills up, the model resets internally, potentially losing some information from the beginning of the conversation or instructions. Context management options help maintain continuity and coherence in these situations.
diff --git a/examples/pydantic_models_to_grammar.py b/examples/pydantic_models_to_grammar.py
index 9acc7cc6d..f029c73a2 100644
--- a/examples/pydantic_models_to_grammar.py
+++ b/examples/pydantic_models_to_grammar.py
@@ -624,7 +624,7 @@ string ::= "\"" (
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" ws
ws ::= ([ \t\n] ws)?
-float ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
+float ::= ("-"? ([0] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
integer ::= [0-9]+"""
diff --git a/examples/server/README.md b/examples/server/README.md
index 0c3db8c84..ccbdcdbdb 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -279,7 +279,7 @@ node index.js
`id_slot`: Assign the completion task to a specific slot. If it is -1, the task will be assigned to an idle slot. Default: `-1`
- `cache_prompt`: Re-use previously cached prompt from the last request if possible. This may prevent re-caching the prompt from scratch. Default: `false`
+ `cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation), enabling this option can cause nondeterministic results. Default: `false`
`system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
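To illustrate the `cache_prompt` field described above, a minimal request sketch against the default endpoint and port (the prompt is a placeholder):

```sh
curl http://localhost:8080/completion \
    -H "Content-Type: application/json" \
    -d '{"prompt": "Building a website can be done in 10 simple steps:", "n_predict": 64, "cache_prompt": true}'
```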
diff --git a/examples/server/public/json-schema-to-grammar.mjs b/examples/server/public/json-schema-to-grammar.mjs
index 8e0be1b40..cef11eab8 100644
--- a/examples/server/public/json-schema-to-grammar.mjs
+++ b/examples/server/public/json-schema-to-grammar.mjs
@@ -2,57 +2,26 @@
const SPACE_RULE = '" "?';
function _buildRepetition(itemRule, minItems, maxItems, opts={}) {
+ if (minItems === 0 && maxItems === 1) {
+ return `${itemRule}?`;
+ }
+
const separatorRule = opts.separatorRule ?? '';
const itemRuleIsLiteral = opts.itemRuleIsLiteral ?? false
if (separatorRule === '') {
- if (minItems === 0 && maxItems === 1) {
- return `${itemRule}?`;
- } else if (minItems === 1 && maxItems === undefined) {
+ if (minItems === 1 && maxItems === undefined) {
return `${itemRule}+`;
- }
- }
-
- let result = '';
- if (minItems > 0) {
- if (itemRuleIsLiteral && separatorRule === '') {
- result = `"${itemRule.slice(1, -1).repeat(minItems)}"`;
+ } else if (minItems === 0 && maxItems === undefined) {
+ return `${itemRule}*`;
} else {
- result = Array.from({ length: minItems }, () => itemRule)
- .join(separatorRule !== '' ? ` ${separatorRule} ` : ' ');
+ return `${itemRule}{${minItems},${maxItems !== undefined ? maxItems : ''}}`;
}
}
- const optRepetitions = (upToN, prefixWithSep=false) => {
- const content = separatorRule !== '' && prefixWithSep ? `${separatorRule} ${itemRule}` : itemRule;
- if (upToN === 0) {
- return '';
- } else if (upToN === 1) {
- return `(${content})?`;
- } else if (separatorRule !== '' && !prefixWithSep) {
- return `(${content} ${optRepetitions(upToN - 1, true)})?`;
- } else {
- return Array.from({ length: upToN }, () => `(${content}`).join(' ').trim() + Array.from({ length: upToN }, () => ')?').join('');
- }
- };
-
- if (minItems > 0 && maxItems !== minItems) {
- result += ' ';
- }
-
- if (maxItems !== undefined) {
- result += optRepetitions(maxItems - minItems, minItems > 0);
- } else {
- const itemOperator = `(${separatorRule !== '' ? separatorRule + ' ' : ''}${itemRule})`;
-
- if (minItems === 0 && separatorRule !== '') {
- result = `(${itemRule} ${itemOperator}*)?`;
- } else {
- result += `${itemOperator}*`;
- }
- }
-
- return result;
+ const result = itemRule + ' ' + _buildRepetition(`(${separatorRule} ${itemRule})`, minItems > 0 ? minItems - 1 : 0, maxItems !== undefined ? maxItems - 1 : undefined);
+ return minItems === 0 ? `(${result})?` : result;
}
class BuiltinRule {
@@ -62,27 +31,25 @@ class BuiltinRule {
}
}
-const UP_TO_15_DIGITS = _buildRepetition('[0-9]', 0, 15);
-
const PRIMITIVE_RULES = {
boolean : new BuiltinRule('("true" | "false") space', []),
- 'decimal-part' : new BuiltinRule('[0-9] ' + UP_TO_15_DIGITS, []),
- 'integral-part': new BuiltinRule('[0-9] | [1-9] ' + UP_TO_15_DIGITS, []),
+ 'decimal-part' : new BuiltinRule('[0-9]{1,16}', []),
+ 'integral-part': new BuiltinRule('[0] | [1-9] [0-9]{0,15}', []),
number : new BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']),
integer : new BuiltinRule('("-"? integral-part) space', ['integral-part']),
value : new BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']),
object : new BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']),
array : new BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']),
- uuid : new BuiltinRule('"\\"" ' + [8, 4, 4, 4, 12].map(n => [...new Array(n)].map(_ => '[0-9a-fA-F]').join('')).join(' "-" ') + ' "\\"" space', []),
- char : new BuiltinRule(`[^"\\\\] | "\\\\" (["\\\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])`, []),
+ uuid : new BuiltinRule('"\\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\\"" space', []),
+ char : new BuiltinRule(`[^"\\\\] | "\\\\" (["\\\\/bfnrt] | "u" [0-9a-fA-F]{4})`, []),
string : new BuiltinRule(`"\\"" char* "\\"" space`, ['char']),
null : new BuiltinRule('"null" space', []),
};
// TODO: support "uri", "email" string formats
const STRING_FORMAT_RULES = {
- 'date' : new BuiltinRule('[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []),
- 'time' : new BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []),
+ 'date' : new BuiltinRule('[0-9]{4} "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []),
+ 'time' : new BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9]{3} )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []),
'date-time' : new BuiltinRule('date "T" time', ['date', 'time']),
'date-string' : new BuiltinRule('"\\"" date "\\"" space', ['date']),
'time-string' : new BuiltinRule('"\\"" time "\\"" space', ['time']),
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index d581cad95..528220607 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -888,7 +888,7 @@ struct server_context {
slot.params.input_suffix = json_value(data, "input_suffix", default_params.input_suffix);
// get prompt
- {
+ if (!task.infill) {
const auto & prompt = data.find("prompt");
if (prompt == data.end()) {
send_error(task, "Either \"prompt\" or \"messages\" must be provided", ERROR_TYPE_INVALID_REQUEST);
@@ -2360,7 +2360,7 @@ int main(int argc, char ** argv) {
// TODO: not great to use extern vars
server_log_json = params.log_json;
- server_verbose = params.verbose;
+ server_verbose = params.verbosity > 0;
// struct that contains llama context and inference
server_context ctx_server;
diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp
index e2f85c682..b779f6bd4 100644
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -302,7 +302,7 @@ static struct ggml_tensor * llama_build_train_graphs(
const int rope_mode = 0;
return ggml_rope_ext(
- ctx, t, KQ_pos, nullptr, n_rot, rope_mode, n_ctx, 0, rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
+ ctx, t, KQ_pos, nullptr, n_rot, rope_mode, n_ctx, rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
);
};
diff --git a/ggml-common.h b/ggml-common.h
index 77e6bfba4..e8efceb76 100644
--- a/ggml-common.h
+++ b/ggml-common.h
@@ -123,12 +123,18 @@ typedef sycl::half2 ggml_half2;
#define QI1_S (QK_K / (4*QR1_S))
#define QR1_S 8
+#define QI1_M (QK_K / (4*QR1_M))
+#define QR1_M 8
+
#define QI4_NL (QK4_NL / (4*QR4_NL))
#define QR4_NL 2
#define QI4_XS (QK_K / (4*QR4_XS))
#define QR4_XS 8
+#define QI3_S (QK_K / (4*QR3_S))
+#define QR3_S 8
+
#endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP
#define QK4_0 32
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index c2c9940bf..c8af1944e 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -625,88 +625,22 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
// cuda split buffer
-static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split) {
- int64_t min_compute_capability = INT_MAX;
- int64_t max_compute_capability = INT_MIN;
+static int64_t get_row_rounding(const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split) {
+ int64_t row_rounding = 0;
for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
- if (tensor_split[id] < (id + 1 < ggml_backend_cuda_get_device_count() ? tensor_split[id + 1] : 1.0f)) {
- if (min_compute_capability > ggml_cuda_info().devices[id].cc) {
- min_compute_capability = ggml_cuda_info().devices[id].cc;
- }
- if (max_compute_capability < ggml_cuda_info().devices[id].cc) {
- max_compute_capability = ggml_cuda_info().devices[id].cc;
- }
+ if (tensor_split[id] >= (id + 1 < ggml_backend_cuda_get_device_count() ? tensor_split[id + 1] : 1.0f)) {
+ continue;
}
- }
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
- switch(type) {
- case GGML_TYPE_Q4_0:
- case GGML_TYPE_Q4_1:
- case GGML_TYPE_Q5_0:
- case GGML_TYPE_Q5_1:
- case GGML_TYPE_Q8_0:
- return max_compute_capability >= CC_RDNA2 ? 128 : 64;
- case GGML_TYPE_F16:
- case GGML_TYPE_F32:
- return 1;
- case GGML_TYPE_Q2_K:
- return max_compute_capability >= CC_RDNA2 ? 128 : 32;
- case GGML_TYPE_Q3_K:
- return min_compute_capability < CC_RDNA2 ? 128 : 64;
- case GGML_TYPE_Q4_K:
- case GGML_TYPE_Q5_K:
- case GGML_TYPE_Q6_K:
- case GGML_TYPE_IQ2_XXS:
- case GGML_TYPE_IQ2_XS:
- case GGML_TYPE_IQ2_S:
- case GGML_TYPE_IQ3_XXS:
- case GGML_TYPE_IQ1_S:
- case GGML_TYPE_IQ1_M:
- case GGML_TYPE_IQ4_NL:
- case GGML_TYPE_IQ4_XS:
- case GGML_TYPE_IQ3_S:
- return max_compute_capability >= CC_RDNA2 ? 128 : 64;
- default:
- GGML_ASSERT(false);
+ const int cc = ggml_cuda_info().devices[id].cc;
+ row_rounding = std::max(row_rounding, (int64_t)get_mmq_y_host(cc, get_mmq_x_max_host(cc)));
}
-#else
- switch(type) {
- case GGML_TYPE_Q4_0:
- case GGML_TYPE_Q4_1:
- return max_compute_capability >= CC_VOLTA ? 128 : 64;
- case GGML_TYPE_Q5_0:
- case GGML_TYPE_Q5_1:
- case GGML_TYPE_Q8_0:
- return 64;
- case GGML_TYPE_F16:
- case GGML_TYPE_F32:
- return 1;
- case GGML_TYPE_Q2_K:
- case GGML_TYPE_Q3_K:
- case GGML_TYPE_Q4_K:
- case GGML_TYPE_Q5_K:
- case GGML_TYPE_IQ2_XXS:
- case GGML_TYPE_IQ2_XS:
- case GGML_TYPE_IQ2_S:
- case GGML_TYPE_IQ3_XXS:
- case GGML_TYPE_IQ1_S:
- case GGML_TYPE_IQ1_M:
- case GGML_TYPE_IQ4_NL:
- case GGML_TYPE_IQ4_XS:
- case GGML_TYPE_IQ3_S:
- return max_compute_capability >= CC_VOLTA ? 128 : 64;
- case GGML_TYPE_Q6_K:
- return 64;
- default:
- GGML_ASSERT(false);
- }
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ return row_rounding;
}
static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split, int id) {
const int64_t nrows = ggml_nrows(tensor);
- const int64_t rounding = get_row_rounding(tensor->type, tensor_split);
+ const int64_t rounding = get_row_rounding(tensor_split);
*row_low = id == 0 ? 0 : nrows*tensor_split[id];
*row_low -= *row_low % rounding;
@@ -1487,7 +1421,7 @@ static void ggml_cuda_op_mul_mat(
// for multi GPU, get the row boundaries from tensor split
// and round to mul_mat_q tile sizes
if (split) {
- const int64_t rounding = get_row_rounding(src0->type, tensor_split);
+ const int64_t rounding = get_row_rounding(tensor_split);
if (id != 0) {
dev[id].row_low = ne01*tensor_split[id];
diff --git a/ggml-cuda/common.cuh b/ggml-cuda/common.cuh
index 22872ca5c..90a0a81ea 100644
--- a/ggml-cuda/common.cuh
+++ b/ggml-cuda/common.cuh
@@ -160,7 +160,7 @@
#endif
#define MMVQ_MAX_BATCH_SIZE 8 // max batch size to use MMVQ kernels
-#define MMQ_MAX_BATCH_SIZE 32 // max batch size to use MMQ kernels when tensor cores are available
+#define MMQ_MAX_BATCH_SIZE 64 // max batch size to use MMQ kernels when tensor cores are available
#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
@@ -484,6 +484,161 @@ static __device__ __forceinline__ float get_alibi_slope(
return powf(base, exph);
}
+template <ggml_type type>
+struct ggml_cuda_type_traits;
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_F16> {
+ static constexpr int qk = 1;
+ static constexpr int qr = 1;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_Q4_0> {
+ static constexpr int qk = QK4_0;
+ static constexpr int qr = QR4_0;
+ static constexpr int qi = QI4_0;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_Q4_1> {
+ static constexpr int qk = QK4_1;
+ static constexpr int qr = QR4_1;
+ static constexpr int qi = QI4_1;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_Q5_0> {
+ static constexpr int qk = QK5_0;
+ static constexpr int qr = QR5_0;
+ static constexpr int qi = QI5_0;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_Q5_1> {
+ static constexpr int qk = QK5_1;
+ static constexpr int qr = QR5_1;
+ static constexpr int qi = QI5_1;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_Q8_0> {
+ static constexpr int qk = QK8_0;
+ static constexpr int qr = QR8_0;
+ static constexpr int qi = QI8_0;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_Q2_K> {
+ static constexpr int qk = QK_K;
+ static constexpr int qr = QR2_K;
+ static constexpr int qi = QI2_K;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_Q3_K> {
+ static constexpr int qk = QK_K;
+ static constexpr int qr = QR3_K;
+ static constexpr int qi = QI3_K;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_Q4_K> {
+ static constexpr int qk = QK_K;
+ static constexpr int qr = QR4_K;
+ static constexpr int qi = QI4_K;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_Q5_K> {
+ static constexpr int qk = QK_K;
+ static constexpr int qr = QR5_K;
+ static constexpr int qi = QI5_K;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_Q6_K> {
+ static constexpr int qk = QK_K;
+ static constexpr int qr = QR6_K;
+ static constexpr int qi = QI6_K;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_IQ2_XXS> {
+ static constexpr int qk = QK_K;
+ static constexpr int qr = QR2_XXS;
+ static constexpr int qi = QI2_XXS;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_IQ2_XS> {
+ static constexpr int qk = QK_K;
+ static constexpr int qr = QR2_XS;
+ static constexpr int qi = QI2_XS;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_IQ2_S> {
+ static constexpr int qk = QK_K;
+ static constexpr int qr = QR2_S;
+ static constexpr int qi = QI2_S;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_IQ3_XXS> {
+ static constexpr int qk = QK_K;
+ static constexpr int qr = QR3_XXS;
+ static constexpr int qi = QI3_XXS;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_IQ1_S> {
+ static constexpr int qk = QK_K;
+ static constexpr int qr = QR1_S;
+ static constexpr int qi = QI1_S;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_IQ1_M> {
+ static constexpr int qk = QK_K;
+ static constexpr int qr = QR1_M;
+ static constexpr int qi = QI1_M;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_IQ4_NL> {
+ static constexpr int qk = QK4_NL;
+ static constexpr int qr = QR4_NL;
+ static constexpr int qi = QI4_NL;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_IQ4_XS> {
+ static constexpr int qk = QK_K;
+ static constexpr int qr = QR4_XS;
+ static constexpr int qi = QI4_XS;
+};
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_IQ3_S> {
+ static constexpr int qk = QK_K;
+ static constexpr int qr = QR3_S;
+ static constexpr int qi = QI3_S;
+};
+
+static int get_mmq_x_max_host(const int cc) {
+#ifdef CUDA_USE_TENSOR_CORES
+ return cc >= CC_VOLTA && cc < CC_OFFSET_AMD ? MMQ_MAX_BATCH_SIZE : 64;
+#else
+ return cc >= CC_VOLTA && cc < CC_OFFSET_AMD ? 128 : 64;
+#endif // CUDA_USE_TENSOR_CORES
+}
+
+// Round rows to this value for --split-mode row:
+static int get_mmq_y_host(const int cc, const int mmq_x) {
+ return cc >= CC_VOLTA && mmq_x >= 32 ? 128 : 64;
+}
+
//////////////////////
struct ggml_cuda_device_info {
diff --git a/ggml-cuda/dmmv.cu b/ggml-cuda/dmmv.cu
index 47d4d5d9e..174489e06 100644
--- a/ggml-cuda/dmmv.cu
+++ b/ggml-cuda/dmmv.cu
@@ -422,10 +422,22 @@ static __device__ void convert_f16(const void * vx, const int64_t ib, const int
v.y = x[ib + iqs + 1];
}
-template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
+static constexpr __device__ dequantize_kernel_t get_dequantize_kernel(ggml_type type) {
+ return type == GGML_TYPE_Q4_0 ? dequantize_q4_0 :
+ type == GGML_TYPE_Q4_1 ? dequantize_q4_1 :
+ type == GGML_TYPE_Q5_0 ? dequantize_q5_0 :
+ type == GGML_TYPE_Q5_1 ? dequantize_q5_1 :
+ type == GGML_TYPE_Q8_0 ? dequantize_q8_0 :
+ type == GGML_TYPE_F16 ? convert_f16 :
+ nullptr;
+}
+
+template <ggml_type type>
static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
- // qk = quantized weights per x block
- // qr = number of quantized weights per data value in x block
+ constexpr int qk = ggml_cuda_type_traits<type>::qk; // quantized weights per x block
+ constexpr int qr = ggml_cuda_type_traits<type>::qr; // number of quantized weights per data value in x block
+ constexpr dequantize_kernel_t dequantize_kernel = get_dequantize_kernel(type);
+
const int64_t row = (int64_t)blockIdx.x*blockDim.y + threadIdx.y;
if (row >= nrows) {
@@ -493,7 +505,7 @@ static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y,
// the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
- dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
+ dequantize_mul_mat_vec<GGML_TYPE_Q4_0>
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
}
@@ -502,7 +514,7 @@ static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y,
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
- dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
+ dequantize_mul_mat_vec<GGML_TYPE_Q4_1>
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
}
@@ -511,7 +523,7 @@ static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y,
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
- dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
+ dequantize_mul_mat_vec<GGML_TYPE_Q5_0>
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
}
@@ -520,7 +532,7 @@ static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y,
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
- dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
+ dequantize_mul_mat_vec<GGML_TYPE_Q5_1>
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
}
@@ -529,7 +541,7 @@ static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y,
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
- dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
+ dequantize_mul_mat_vec<GGML_TYPE_Q8_0>
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
}
@@ -580,7 +592,7 @@ static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, floa
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
- dequantize_mul_mat_vec<1, 1, convert_f16>
+ dequantize_mul_mat_vec<GGML_TYPE_F16>
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
}
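As an aside (not part of the patch), the single `ggml_type` template parameter now carries the same information the old `<qk, qr, dequantize_kernel>` arguments did, via the traits and helper added to `common.cuh` above:

```cpp
// illustration only: these hold by construction of ggml_cuda_type_traits
static_assert(ggml_cuda_type_traits<GGML_TYPE_Q4_0>::qk == QK4_0, "");
static_assert(ggml_cuda_type_traits<GGML_TYPE_Q4_0>::qr == QR4_0, "");
// and get_dequantize_kernel(GGML_TYPE_Q4_0) evaluates to dequantize_q4_0 at compile time
```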
diff --git a/ggml-cuda/mmq.cu b/ggml-cuda/mmq.cu
index ebe1dc5c8..58799e4ca 100644
--- a/ggml-cuda/mmq.cu
+++ b/ggml-cuda/mmq.cu
@@ -1,1450 +1,4 @@
#include "mmq.cuh"
-#include "vecdotq.cuh"
-
-typedef void (*allocate_tiles_cuda_t)(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc);
-typedef void (*load_tiles_cuda_t)(
- const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
- int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row);
-typedef float (*vec_dot_q_mul_mat_cuda_t)(
- const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
- const int * __restrict__ y_qs, const half2 * __restrict__ y_ms, const int & i, const int & j, const int & k);
-typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v);
-typedef void (mul_mat_q_t)(
- const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
- const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst);
-
-struct mmq_arch_config_t {
- int x;
- int y;
- int nwarps;
-};
-
-struct mmq_config_t {
- mmq_arch_config_t rdna2;
- mmq_arch_config_t rdna1;
- mmq_arch_config_t ampere;
- mmq_arch_config_t pascal;
-};
-
-constexpr mmq_config_t MMQ_CONFIG_Q4_0 = {
-// x y nwarps
- { 64, 128, 8},
- { 64, 64, 8},
-#ifdef CUDA_USE_TENSOR_CORES
- { 4, 32, 4},
-#else
- { 64, 128, 4},
-#endif // CUDA_USE_TENSOR_CORES
- { 64, 64, 8},
-};
-constexpr mmq_config_t MMQ_CONFIG_Q4_1 = {
-// x y nwarps
- { 64, 128, 8},
- { 64, 64, 8},
-#ifdef CUDA_USE_TENSOR_CORES
- { 4, 32, 4},
-#else
- { 64, 128, 4},
-#endif // CUDA_USE_TENSOR_CORES
- { 64, 64, 8},
-};
-constexpr mmq_config_t MMQ_CONFIG_Q5_0 = {
-// x y nwarps
- { 64, 128, 8},
- { 64, 64, 8},
-#ifdef CUDA_USE_TENSOR_CORES
- { 4, 32, 4},
-#else
- {128, 64, 4},
-#endif // CUDA_USE_TENSOR_CORES
- { 64, 64, 8},
-};
-constexpr mmq_config_t MMQ_CONFIG_Q5_1 = {
-// x y nwarps
- { 64, 128, 8},
- { 64, 64, 8},
-#ifdef CUDA_USE_TENSOR_CORES
- { 4, 32, 4},
-#else
- {128, 64, 4},
-#endif // CUDA_USE_TENSOR_CORES
- { 64, 64, 8},
-};
-constexpr mmq_config_t MMQ_CONFIG_Q8_0 = {
-// x y nwarps
- { 64, 128, 8},
- { 64, 64, 8},
-#ifdef CUDA_USE_TENSOR_CORES
- { 4, 32, 4},
-#else
- {128, 64, 4},
-#endif // CUDA_USE_TENSOR_CORES
- { 64, 64, 8},
-};
-constexpr mmq_config_t MMQ_CONFIG_Q2_K = {
-// x y nwarps
- { 64, 128, 8},
- {128, 32, 8},
-#ifdef CUDA_USE_TENSOR_CORES
- { 4, 32, 4},
-#else
- { 64, 128, 4},
-#endif // CUDA_USE_TENSOR_CORES
- { 64, 64, 8},
-};
-constexpr mmq_config_t MMQ_CONFIG_Q3_K = {
-// x y nwarps
- {128, 64, 8},
- { 32, 128, 8},
-#ifdef CUDA_USE_TENSOR_CORES
- { 4, 32, 4},
-#else
- {128, 128, 4},
-#endif // CUDA_USE_TENSOR_CORES
- { 64, 64, 8},
-};
-constexpr mmq_config_t MMQ_CONFIG_Q4_K = {
-// x y nwarps
- { 64, 128, 8},
- { 32, 64, 8},
-#ifdef CUDA_USE_TENSOR_CORES
- { 4, 32, 4},
-#else
- { 64, 128, 4},
-#endif // CUDA_USE_TENSOR_CORES
- { 64, 64, 8},
-};
-constexpr mmq_config_t MMQ_CONFIG_Q5_K = {
-// x y nwarps
- { 64, 128, 8},
- { 32, 64, 8},
-#ifdef CUDA_USE_TENSOR_CORES
- { 4, 32, 4},
-#else
- { 64, 128, 4},
-#endif // CUDA_USE_TENSOR_CORES
- { 64, 64, 8},
-};
-constexpr mmq_config_t MMQ_CONFIG_Q6_K = {
-// x y nwarps
- { 64, 128, 8},
- { 32, 64, 8},
-#ifdef CUDA_USE_TENSOR_CORES
- { 4, 32, 4},
-#else
- { 64, 64, 4},
-#endif // CUDA_USE_TENSOR_CORES
- { 64, 64, 8},
-};
-
-// ------------------------------------------------------------
-
-template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
- GGML_UNUSED(x_qh);
- GGML_UNUSED(x_sc);
-
- __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
- __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0];
-
- *x_ql = tile_x_qs;
- *x_dm = (half2 *) tile_x_d;
-}
-
-template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
- const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
- int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
- GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
- GGML_CUDA_ASSUME(i_offset >= 0);
- GGML_CUDA_ASSUME(i_offset < nwarps);
- GGML_CUDA_ASSUME(k >= 0);
- GGML_CUDA_ASSUME(k < WARP_SIZE);
-
- const int kbx = k / QI4_0;
- const int kqsx = k % QI4_0;
-
- const block_q4_0 * bx0 = (const block_q4_0 *) vx;
-
- float * x_dmf = (float *) x_dm;
-
-#pragma unroll
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
- int i = i0 + i_offset;
-
- if (need_check) {
- i = min(i, i_max);
- }
-
- const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx;
-
- x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
- // x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d;
- }
-
- const int blocks_per_tile_x_row = WARP_SIZE / QI4_0;
- const int kbxd = k % blocks_per_tile_x_row;
-
-#pragma unroll
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_0) {
- int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row;
-
- if (need_check) {
- i = min(i, i_max);
- }
-
- const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd;
-
- x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd] = bxi->d;
- }
-}
-
-static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
- const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
- const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
- GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
-
- const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
- const float * x_dmf = (const float *) x_dm;
-
- int u[2*VDR_Q4_0_Q8_1_MMQ];
-
-#pragma unroll
- for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) {
- u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
- u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE];
- }
-
- return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMQ>
- (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0],
- y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
-}
-
-template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
- GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
-
- __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + + mmq_y];
- __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1];
-
- *x_ql = tile_x_qs;
- *x_dm = tile_x_dm;
-}
-
-template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
- const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
- int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
- GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
-
- GGML_CUDA_ASSUME(i_offset >= 0);
- GGML_CUDA_ASSUME(i_offset < nwarps);
- GGML_CUDA_ASSUME(k >= 0);
- GGML_CUDA_ASSUME(k < WARP_SIZE);
-
- const int kbx = k / QI4_1;
- const int kqsx = k % QI4_1;
-
- const block_q4_1 * bx0 = (const block_q4_1 *) vx;
-
-#pragma unroll
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
- int i = i0 + i_offset;
-
- if (need_check) {
- i = min(i, i_max);
- }
-
- const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbx;
-
- x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
- }
-
- const int blocks_per_tile_x_row = WARP_SIZE / QI4_1;
- const int kbxd = k % blocks_per_tile_x_row;
-
-#pragma unroll
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_1) {
- int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row;
-
- if (need_check) {
- i = min(i, i_max);
- }
-
- const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbxd;
-
- x_dm[i * (WARP_SIZE/QI4_1) + i / QI4_1 + kbxd] = bxi->dm;
- }
-}
-
-static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat(
- const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
- const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
- GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
-
- const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
-
- int u[2*VDR_Q4_1_Q8_1_MMQ];
-
-#pragma unroll
- for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) {
- u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
- u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE];
- }
-
- return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMQ>
- (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1],
- y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
-}
-
-template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
- GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
-
- __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
- __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0];
-
- *x_ql = tile_x_ql;
- *x_dm = (half2 *) tile_x_d;
-}
-
-template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
- const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
- int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
- GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
-
- GGML_CUDA_ASSUME(i_offset >= 0);
- GGML_CUDA_ASSUME(i_offset < nwarps);
- GGML_CUDA_ASSUME(k >= 0);
- GGML_CUDA_ASSUME(k < WARP_SIZE);
-
- const int kbx = k / QI5_0;
- const int kqsx = k % QI5_0;
-
- const block_q5_0 * bx0 = (const block_q5_0 *) vx;
-
-#pragma unroll
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
- int i = i0 + i_offset;
-
- if (need_check) {
- i = min(i, i_max);
- }
-
- const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbx;
-
- const int ql = get_int_from_uint8(bxi->qs, kqsx);
- const int qh = get_int_from_uint8(bxi->qh, 0) >> (4 * (k % QI5_0));
-
- int qs0 = (ql >> 0) & 0x0F0F0F0F;
- qs0 |= (qh << 4) & 0x00000010; // 0 -> 4
- qs0 |= (qh << 11) & 0x00001000; // 1 -> 12
- qs0 |= (qh << 18) & 0x00100000; // 2 -> 20
- qs0 |= (qh << 25) & 0x10000000; // 3 -> 28
- qs0 = __vsubss4(qs0, 0x10101010); // subtract 16
-
- x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
-
- int qs1 = (ql >> 4) & 0x0F0F0F0F;
- qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4
- qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12
- qs1 |= (qh << 2) & 0x00100000; // 18 -> 20
- qs1 |= (qh << 9) & 0x10000000; // 19 -> 28
- qs1 = __vsubss4(qs1, 0x10101010); // subtract 16
-
- x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
- }
-
- const int blocks_per_tile_x_row = WARP_SIZE / QI5_0;
- const int kbxd = k % blocks_per_tile_x_row;
- float * x_dmf = (float *) x_dm;
-
-#pragma unroll
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_0) {
- int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row;
-
- if (need_check) {
- i = min(i, i_max);
- }
-
- const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbxd;
-
- x_dmf[i * (WARP_SIZE/QI5_0) + i / QI5_0 + kbxd] = bxi->d;
- }
-}
-
-static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat(
- const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
- const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
- GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
-
- const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
- const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
- const float * x_dmf = (const float *) x_dm;
- const float * y_df = (const float *) y_ds;
-
- int u[2*VDR_Q5_0_Q8_1_MMQ];
-
-#pragma unroll
- for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) {
- u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
- u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE];
- }
-
- return vec_dot_q8_0_q8_1_impl<QR5_0*VDR_Q5_0_Q8_1_MMQ>
- (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
-}
-
-
-template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
- GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
-
- __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
- __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1];
-
- *x_ql = tile_x_ql;
- *x_dm = tile_x_dm;
-}
-
-template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
- const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
- int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
- GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
-
- GGML_CUDA_ASSUME(i_offset >= 0);
- GGML_CUDA_ASSUME(i_offset < nwarps);
- GGML_CUDA_ASSUME(k >= 0);
- GGML_CUDA_ASSUME(k < WARP_SIZE);
-
- const int kbx = k / QI5_1;
- const int kqsx = k % QI5_1;
-
- const block_q5_1 * bx0 = (const block_q5_1 *) vx;
-
-#pragma unroll
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
- int i = i0 + i_offset;
-
- if (need_check) {
- i = min(i, i_max);
- }
-
- const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbx;
-
- const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
- const int qh = get_int_from_uint8_aligned(bxi->qh, 0) >> (4 * (k % QI5_1));
-
- int qs0 = (ql >> 0) & 0x0F0F0F0F;
- qs0 |= (qh << 4) & 0x00000010; // 0 -> 4
- qs0 |= (qh << 11) & 0x00001000; // 1 -> 12
- qs0 |= (qh << 18) & 0x00100000; // 2 -> 20
- qs0 |= (qh << 25) & 0x10000000; // 3 -> 28
-
- x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
-
- int qs1 = (ql >> 4) & 0x0F0F0F0F;
- qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4
- qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12
- qs1 |= (qh << 2) & 0x00100000; // 18 -> 20
- qs1 |= (qh << 9) & 0x10000000; // 19 -> 28
-
- x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
- }
-
- const int blocks_per_tile_x_row = WARP_SIZE / QI5_1;
- const int kbxd = k % blocks_per_tile_x_row;
-
-#pragma unroll
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_1) {
- int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row;
-
- if (need_check) {
- i = min(i, i_max);
- }
-
- const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbxd;
-
- x_dm[i * (WARP_SIZE/QI5_1) + i / QI5_1 + kbxd] = bxi->dm;
- }
-}
-
-static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
- const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
- const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
- GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
-
- const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
- const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1;
-
- int u[2*VDR_Q5_1_Q8_1_MMQ];
-
-#pragma unroll
- for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) {
- u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
- u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE];
- }
-
- return vec_dot_q8_1_q8_1_impl<QR5_1*VDR_Q5_1_Q8_1_MMQ>
- (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
-}
-
-template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
- GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
-
- __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
- __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0];
-
- *x_ql = tile_x_qs;
- *x_dm = (half2 *) tile_x_d;
-}
-
-template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
- const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
- int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
- GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
-
- GGML_CUDA_ASSUME(i_offset >= 0);
- GGML_CUDA_ASSUME(i_offset < nwarps);
- GGML_CUDA_ASSUME(k >= 0);
- GGML_CUDA_ASSUME(k < WARP_SIZE);
-
- const int kbx = k / QI8_0;
- const int kqsx = k % QI8_0;
- float * x_dmf = (float *) x_dm;
-
- const block_q8_0 * bx0 = (const block_q8_0 *) vx;
-
-#pragma unroll
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
- int i = i0 + i_offset;
-
- if (need_check) {
- i = min(i, i_max);
- }
-
- const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx;
-
- x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx);
- }
-
- const int blocks_per_tile_x_row = WARP_SIZE / QI8_0;
- const int kbxd = k % blocks_per_tile_x_row;
-
-#pragma unroll
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI8_0) {
- int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row;
-
- if (need_check) {
- i = min(i, i_max);
- }
-
- const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd;
-
- x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd] = bxi->d;
- }
-}
-
-static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat(
- const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
- const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
- GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
-
- const float * x_dmf = (const float *) x_dm;
- const float * y_df = (const float *) y_ds;
-
- return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMQ>
- (&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0],
- y_df[j * (WARP_SIZE/QI8_1) + k/QI8_1]);
-}
-
-template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
- GGML_UNUSED(x_qh);
-
- __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
- __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K];
- __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4];
-
- *x_ql = tile_x_ql;
- *x_dm = tile_x_dm;
- *x_sc = tile_x_sc;
-}
-
-template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
- const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
- int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
- GGML_UNUSED(x_qh);
-
- GGML_CUDA_ASSUME(i_offset >= 0);
- GGML_CUDA_ASSUME(i_offset < nwarps);
- GGML_CUDA_ASSUME(k >= 0);
- GGML_CUDA_ASSUME(k < WARP_SIZE);
-
- const int kbx = k / QI2_K;
- const int kqsx = k % QI2_K;
-
- const block_q2_K * bx0 = (const block_q2_K *) vx;
-
-#pragma unroll
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
- int i = i0 + i_offset;
-
- if (need_check) {
- i = min(i, i_max);
- }
-
- const block_q2_K * bxi = bx0 + i*blocks_per_row + kbx;
-
- x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
- }
-
- const int blocks_per_tile_x_row = WARP_SIZE / QI2_K;
- const int kbxd = k % blocks_per_tile_x_row;
-
-#pragma unroll
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI2_K) {
- int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % mmq_y;
-
- if (need_check) {
- i = min(i, i_max);
- }
-
- const block_q2_K * bxi = bx0 + i*blocks_per_row + kbxd;
-
- x_dm[i * (WARP_SIZE/QI2_K) + i / QI2_K + kbxd] = bxi->dm;
- }
-
-#pragma unroll
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
- int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
-
- if (need_check) {
- i = min(i, i_max);
- }
-
- const block_q2_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI2_K/4);
-
- x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8_aligned(bxi->scales, k % (QI2_K/4));
- }
-}
-
-static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat(
- const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
- const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
- GGML_UNUSED(x_qh);
-
- const int kbx = k / QI2_K;
- const int ky = (k % QI2_K) * QR2_K;
- const float * y_df = (const float *) y_ds;
-
- int v[QR2_K*VDR_Q2_K_Q8_1_MMQ];
-
- const int kqsx = i * (WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2);
- const int shift = 2 * ((ky % (2*QI2_K)) / (QI2_K/2));
-
-#pragma unroll
- for (int l = 0; l < QR2_K*VDR_Q2_K_Q8_1_MMQ; ++l) {
- v[l] = (x_ql[kqsx + l] >> shift) & 0x03030303;
- }
-
- const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4;
-
- const int index_y = j * WARP_SIZE + (QR2_K*k) % WARP_SIZE;
- return vec_dot_q2_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]);
-}
-
-template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
-
- __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
- __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI3_K) + mmq_y/QI3_K];
- __shared__ int tile_x_qh[mmq_y * (WARP_SIZE/2) + mmq_y/2];
- __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4];
-
- *x_ql = tile_x_ql;
- *x_dm = tile_x_dm;
- *x_qh = tile_x_qh;
- *x_sc = tile_x_sc;
-}
-
-template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q3_K(
- const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
- int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
-
- GGML_CUDA_ASSUME(i_offset >= 0);
- GGML_CUDA_ASSUME(i_offset < nwarps);
- GGML_CUDA_ASSUME(k >= 0);
- GGML_CUDA_ASSUME(k < WARP_SIZE);
-
- const int kbx = k / QI3_K;
- const int kqsx = k % QI3_K;
-
- const block_q3_K * bx0 = (const block_q3_K *) vx;
-
-#pragma unroll
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
- int i = i0 + i_offset;
-
- if (need_check) {
- i = min(i, i_max);
- }
-
- const block_q3_K * bxi = bx0 + i*blocks_per_row + kbx;
-
- x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
- }
-
- const int blocks_per_tile_x_row = WARP_SIZE / QI3_K;
- const int kbxd = k % blocks_per_tile_x_row;
- float * x_dmf = (float *) x_dm;
-
-#pragma unroll
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI3_K) {
- int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % mmq_y;
-
- if (need_check) {
- i = min(i, i_max);
- }
-
- const block_q3_K * bxi = bx0 + i*blocks_per_row + kbxd;
-
- x_dmf[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd] = bxi->d;
- }
-
-#pragma unroll
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 2) {
- int i = i0 + i_offset * 2 + k / (WARP_SIZE/2);
-
- if (need_check) {
- i = min(i, i_max);
- }
-
- const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI3_K/2);
-
- // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
- x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = ~get_int_from_uint8(bxi->hmask, k % (QI3_K/2));
- }
-
-#pragma unroll
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
- int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
-
- if (need_check) {
- i = min(i, i_max);
- }
-
- const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI3_K/4);
-
- const int ksc = k % (QI3_K/4);
-
- const int ksc_low = ksc % (QI3_K/8);
- const int shift_low = 4 * (ksc / (QI3_K/8));
- const int sc_low = (get_int_from_uint8(bxi->scales, ksc_low) >> shift_low) & 0x0F0F0F0F;
-
- const int ksc_high = QI3_K/8;
- const int shift_high = 2 * ksc;
- const int sc_high = ((get_int_from_uint8(bxi->scales, ksc_high) >> shift_high) << 4) & 0x30303030;
-
- const int sc = __vsubss4(sc_low | sc_high, 0x20202020);
-
- x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = sc;
- }
-}
-
-static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat(
- const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
- const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
-
- const int kbx = k / QI3_K;
- const int ky = (k % QI3_K) * QR3_K;
- const float * x_dmf = (const float *) x_dm;
- const float * y_df = (const float *) y_ds;
-
- const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
-
- int v[QR3_K*VDR_Q3_K_Q8_1_MMQ];
-
-#pragma unroll
- for (int l = 0; l < QR3_K*VDR_Q3_K_Q8_1_MMQ; ++l) {
- const int kqsx = i * (WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2);
- const int shift = 2 * ((ky % 32) / 8);
- const int vll = (x_ql[kqsx + l] >> shift) & 0x03030303;
-
- const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8);
- const int vlh = (vh << 2) & 0x04040404;
-
- v[l] = __vsubss4(vll, vlh);
- }
-
- const int index_y = j * WARP_SIZE + (k*QR3_K) % WARP_SIZE;
- return vec_dot_q3_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]);
-}
-
-template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
- GGML_UNUSED(x_qh);
-
- __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
- __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K];
- __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];
-
- *x_ql = tile_x_ql;
- *x_dm = tile_x_dm;
- *x_sc = tile_x_sc;
-}
-
-template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
- const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
- int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
- GGML_UNUSED(x_qh);
-
- GGML_CUDA_ASSUME(i_offset >= 0);
- GGML_CUDA_ASSUME(i_offset < nwarps);
- GGML_CUDA_ASSUME(k >= 0);
- GGML_CUDA_ASSUME(k < WARP_SIZE);
-
- const int kbx = k / QI4_K; // == 0 if QK_K == 256
- const int kqsx = k % QI4_K; // == k if QK_K == 256
-
- const block_q4_K * bx0 = (const block_q4_K *) vx;
-
-#pragma unroll
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
- int i = i0 + i_offset;
-
- if (need_check) {
- i = min(i, i_max);
- }
-
- const block_q4_K * bxi = bx0 + i*blocks_per_row + kbx;
-
- x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
- }
-
- const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256
- const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
-
-#pragma unroll
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_K) {
- int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % mmq_y;
-
- if (need_check) {
- i = min(i, i_max);
- }
-
- const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
-
- x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
- }
-
-#pragma unroll
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
- int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
-
- if (need_check) {
- i = min(i, i_max);
- }
-
- const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);
-
- const int * scales = (const int *) bxi->scales;
-
- const int ksc = k % (WARP_SIZE/8);
-
- // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8
- int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
- scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits
-
- x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
- }
-}
-
-static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
- const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
- const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
- GGML_UNUSED(x_qh);
-
- const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);
-
- const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE;
- return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[index_y], sc, sc+8,
- x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
-}
-
-template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
- GGML_UNUSED(x_qh);
-
- __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
- __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K];
- __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];
-
- *x_ql = tile_x_ql;
- *x_dm = tile_x_dm;
- *x_sc = tile_x_sc;
-}
-
-template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
- const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
- int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
- GGML_UNUSED(x_qh);
-
- GGML_CUDA_ASSUME(i_offset >= 0);
- GGML_CUDA_ASSUME(i_offset < nwarps);
- GGML_CUDA_ASSUME(k >= 0);
- GGML_CUDA_ASSUME(k < WARP_SIZE);
-
- const int kbx = k / QI5_K; // == 0 if QK_K == 256
- const int kqsx = k % QI5_K; // == k if QK_K == 256
-
- const block_q5_K * bx0 = (const block_q5_K *) vx;
-
-#pragma unroll
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
- int i = i0 + i_offset;
-
- if (need_check) {
- i = min(i, i_max);
- }
-
- const block_q5_K * bxi = bx0 + i*blocks_per_row + kbx;
- const int ky = QR5_K*kqsx;
-
- const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
- const int ql0 = (ql >> 0) & 0x0F0F0F0F;
- const int ql1 = (ql >> 4) & 0x0F0F0F0F;
-
- const int qh = get_int_from_uint8_aligned(bxi->qh, kqsx % (QI5_K/4));
- const int qh0 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 0)) << 4) & 0x10101010;
- const int qh1 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 1)) << 4) & 0x10101010;
-
- const int kq0 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + 0;
- const int kq1 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + (QI5_K/4);
-
- x_ql[i * (2*WARP_SIZE + 1) + kq0] = ql0 | qh0;
- x_ql[i * (2*WARP_SIZE + 1) + kq1] = ql1 | qh1;
- }
-
- const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256
- const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
-
-#pragma unroll
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_K) {
- int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % mmq_y;
-
- if (need_check) {
- i = min(i, i_max);
- }
-
- const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
-
- x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
- }
-
-#pragma unroll
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
- int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
-
- if (need_check) {
- i = min(i, i_max);
- }
-
- const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);
-
- const int * scales = (const int *) bxi->scales;
-
- const int ksc = k % (WARP_SIZE/8);
-
- // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8
- int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
- scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits
-
- x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
- }
-}
-
-static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
- const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
- const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
- GGML_UNUSED(x_qh);
-
- const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8);
-
- const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k;
- const int index_y = j * WARP_SIZE + (QR5_K*k) % WARP_SIZE;
- return vec_dot_q5_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8,
- x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
-}
-
-template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
- GGML_UNUSED(x_qh);
-
- __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
- __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K];
- __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];
-
- *x_ql = tile_x_ql;
- *x_dm = tile_x_dm;
- *x_sc = tile_x_sc;
-}
-
-template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
- const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
- int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
- GGML_UNUSED(x_qh);
-
- GGML_CUDA_ASSUME(i_offset >= 0);
- GGML_CUDA_ASSUME(i_offset < nwarps);
- GGML_CUDA_ASSUME(k >= 0);
- GGML_CUDA_ASSUME(k < WARP_SIZE);
-
- const int kbx = k / QI6_K; // == 0 if QK_K == 256
- const int kqsx = k % QI6_K; // == k if QK_K == 256
-
- const block_q6_K * bx0 = (const block_q6_K *) vx;
-
-#pragma unroll
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
- int i = i0 + i_offset;
-
- if (need_check) {
- i = min(i, i_max);
- }
-
- const block_q6_K * bxi = bx0 + i*blocks_per_row + kbx;
- const int ky = QR6_K*kqsx;
-
- const int ql = get_int_from_uint8(bxi->ql, kqsx);
- const int ql0 = (ql >> 0) & 0x0F0F0F0F;
- const int ql1 = (ql >> 4) & 0x0F0F0F0F;
-
- const int qh = get_int_from_uint8(bxi->qh, (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4));
- const int qh0 = ((qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) << 4) & 0x30303030;
- const int qh1 = (qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) & 0x30303030;
-
- const int kq0 = ky - ky % QI6_K + k % (QI6_K/2) + 0;
- const int kq1 = ky - ky % QI6_K + k % (QI6_K/2) + (QI6_K/2);
-
- x_ql[i * (2*WARP_SIZE + 1) + kq0] = __vsubss4(ql0 | qh0, 0x20202020);
- x_ql[i * (2*WARP_SIZE + 1) + kq1] = __vsubss4(ql1 | qh1, 0x20202020);
- }
-
- const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256
- const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
- float * x_dmf = (float *) x_dm;
-
-#pragma unroll
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI6_K) {
- int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % mmq_y;
-
- if (need_check) {
- i = min(i, i_max);
- }
-
- const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd;
-
- x_dmf[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd] = bxi->d;
- }
-
-#pragma unroll
- for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
- int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
-
- if (need_check) {
- i = min(i, i_max);
- }
-
- const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / 4;
-
- x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_int8(bxi->scales, k % (QI6_K/8));
- }
-}
-
-static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
- const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
- const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
- GGML_UNUSED(x_qh);
-
- const float * x_dmf = (const float *) x_dm;
- const float * y_df = (const float *) y_ds;
-
- const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/8]);
-
- const int index_x = i * (QR6_K*WARP_SIZE + 1) + QR6_K*k;
- const int index_y = j * WARP_SIZE + (QR6_K*k) % WARP_SIZE;
- return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]);
-}
-
-template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
-              allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
-static __device__ __forceinline__ void mul_mat_q(
- const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
- const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-
- const block_q_t * x = (const block_q_t *) vx;
- const block_q8_1 * y = (const block_q8_1 *) vy;
-
- const int blocks_per_row_x = ncols_x / qk;
- const int blocks_per_col_y = nrows_y / QK8_1;
- const int blocks_per_warp = WARP_SIZE / qi;
-
- const int & ncols_dst = ncols_y;
-
- const int row_dst_0 = blockIdx.x*mmq_y;
- const int & row_x_0 = row_dst_0;
-
- const int col_dst_0 = blockIdx.y*mmq_x;
- const int & col_y_0 = col_dst_0;
-
- int * tile_x_ql = nullptr;
- half2 * tile_x_dm = nullptr;
- int * tile_x_qh = nullptr;
- int * tile_x_sc = nullptr;
-
- allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
-
- __shared__ int tile_y_qs[mmq_x * WARP_SIZE];
- __shared__ half2 tile_y_ds[mmq_x * WARP_SIZE/QI8_1];
-
- float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {{0.0f}};
-
- for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {
-
- load_tiles(x + row_x_0*blocks_per_row_x + ib0, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc,
- threadIdx.y, nrows_x-row_x_0-1, threadIdx.x, blocks_per_row_x);
-
-#pragma unroll
- for (int ir = 0; ir < qr; ++ir) {
- const int kqs = ir*WARP_SIZE + threadIdx.x;
- const int kbxd = kqs / QI8_1;
-
-#pragma unroll
- for (int i = 0; i < mmq_x; i += nwarps) {
- const int col_y_eff = min(col_y_0 + threadIdx.y + i, ncols_y-1); // to prevent out-of-bounds memory accesses
-
- const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd];
-
- const int index_y = (threadIdx.y + i) * WARP_SIZE + kqs % WARP_SIZE;
- tile_y_qs[index_y] = get_int_from_int8_aligned(by0->qs, threadIdx.x % QI8_1);
- }
-
-#pragma unroll
- for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) {
- const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE/QI8_1)) % mmq_x;
- const int kby = threadIdx.x % (WARP_SIZE/QI8_1);
- const int col_y_eff = min(col_y_0 + ids, ncols_y-1);
-
- // if the sum is not needed it's faster to transform the scale to f32 ahead of time
- const half2 * dsi_src = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + ir*(WARP_SIZE/QI8_1) + kby].ds;
- half2 * dsi_dst = &tile_y_ds[ids * (WARP_SIZE/QI8_1) + kby];
- if (need_sum) {
- *dsi_dst = *dsi_src;
- } else {
- float * dfi_dst = (float *) dsi_dst;
- *dfi_dst = __low2float(*dsi_src);
- }
- }
-
- __syncthreads();
-
-// #pragma unroll // unrolling this loop causes too much register pressure
- for (int k = ir*WARP_SIZE/qr; k < (ir+1)*WARP_SIZE/qr; k += vdr) {
-#pragma unroll
- for (int j = 0; j < mmq_x; j += nwarps) {
-#pragma unroll
- for (int i = 0; i < mmq_y; i += WARP_SIZE) {
- sum[i/WARP_SIZE][j/nwarps] += vec_dot(
- tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds,
- threadIdx.x + i, threadIdx.y + j, k);
- }
- }
- }
-
- __syncthreads();
- }
- }
-
-#pragma unroll
- for (int j = 0; j < mmq_x; j += nwarps) {
- const int col_dst = col_dst_0 + j + threadIdx.y;
-
- if (col_dst >= ncols_dst) {
- return;
- }
-
-#pragma unroll
- for (int i = 0; i < mmq_y; i += WARP_SIZE) {
- const int row_dst = row_dst_0 + threadIdx.x + i;
-
- if (row_dst >= nrows_dst) {
- continue;
- }
-
- dst[col_dst*nrows_dst + row_dst] = sum[i/WARP_SIZE][j/nwarps];
- }
- }
-}
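For orientation, the accumulator-to-output mapping at the end of the (removed) mul_mat_q above is the part that is easiest to misread: each thread holds (mmq_y/WARP_SIZE) x (mmq_x/nwarps) partial sums and scatters them into a column-major dst. A standalone sketch of the same index arithmetic; the tile shape, matrix extents and thread indices below are illustrative assumptions, not values taken from this patch:

    // Host-side sketch mirroring the sum[][] -> dst write-back loop above.
    #include <cstdio>

    int main() {
        const int WARP_SIZE = 32;
        const int mmq_x = 64, mmq_y = 128, nwarps = 8;   // tile shape (assumed example)
        const int nrows_dst = 4096, ncols_dst = 512;     // matrix extents (assumed example)
        const int row_dst_0 = 0, col_dst_0 = 0;          // tile origin, i.e. blockIdx.{x,y} * tile size
        const int tx = 5, ty = 3;                        // one example thread (threadIdx.x, threadIdx.y)

        // each thread owns (mmq_y/WARP_SIZE) * (mmq_x/nwarps) partial sums
        for (int j = 0; j < mmq_x; j += nwarps) {
            const int col_dst = col_dst_0 + j + ty;
            if (col_dst >= ncols_dst) break;             // the kernel returns here
            for (int i = 0; i < mmq_y; i += WARP_SIZE) {
                const int row_dst = row_dst_0 + tx + i;
                if (row_dst >= nrows_dst) continue;
                printf("sum[%d][%d] -> dst[%d]\n", i/WARP_SIZE, j/nwarps, col_dst*nrows_dst + row_dst);
            }
        }
        return 0;
    }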
-
-static constexpr __device__ mmq_arch_config_t get_arch_config_device(mmq_config_t mmq_config) {
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-
-#if defined(RDNA3) || defined(RDNA2)
- return mmq_config.rdna2;
-#else
- return mmq_config.rdna1;
-#endif // defined(RDNA3) || defined(RDNA2)
-
-#else
-
-#if __CUDA_ARCH__ >= CC_VOLTA
- return mmq_config.ampere;
-#else
- return mmq_config.pascal;
-#endif // __CUDA_ARCH__ >= CC_VOLTA
-
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-}
-
-template <bool need_check> static __global__ void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
- __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q4_0.rdna2.nwarps, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
- mul_mat_q4_0(
- const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
- const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A
- constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q4_0);
-
- mul_mat_q,
- load_tiles_q4_0, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-#else
- GGML_UNUSED(get_arch_config_device);
- GGML_UNUSED(vec_dot_q4_0_q8_1_mul_mat);
- NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-template <bool need_check> static __global__ void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
- __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q4_1.rdna2.nwarps, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#elif __CUDA_ARCH__ < CC_VOLTA
- __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q4_1.pascal.nwarps, 2)
-#endif // __CUDA_ARCH__ < CC_VOLTA
- mul_mat_q4_1(
- const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
- const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A
- constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q4_1);
-
- mul_mat_q,
- load_tiles_q4_1, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-#else
- GGML_UNUSED(get_arch_config_device);
- GGML_UNUSED(vec_dot_q4_1_q8_1_mul_mat);
- NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-template <bool need_check> static __global__ void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
- __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q5_0.rdna2.nwarps, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
- mul_mat_q5_0(
- const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
- const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A
- constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q5_0);
-
- mul_mat_q,
- load_tiles_q5_0, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-#else
- GGML_UNUSED(get_arch_config_device);
- GGML_UNUSED(vec_dot_q5_0_q8_1_mul_mat);
- NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-template <bool need_check> static __global__ void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
- __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q5_1.rdna2.nwarps, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-mul_mat_q5_1(
- const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
- const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A
- constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q5_1);
-
- mul_mat_q,
- load_tiles_q5_1, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-#else
- GGML_UNUSED(get_arch_config_device);
- GGML_UNUSED(vec_dot_q5_1_q8_1_mul_mat);
- NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-template <bool need_check> static __global__ void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
- __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q8_0.rdna2.nwarps, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
- mul_mat_q8_0(
- const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
- const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A
- constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q8_0);
-
- mul_mat_q,
- load_tiles_q8_0, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-#else
- GGML_UNUSED(get_arch_config_device);
- GGML_UNUSED(vec_dot_q8_0_q8_1_mul_mat);
- NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-template <bool need_check> static __global__ void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
- __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q2_K.rdna2.nwarps, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-mul_mat_q2_K(
- const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
- const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A
- constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q2_K);
-
- mul_mat_q,
- load_tiles_q2_K, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-#else
- GGML_UNUSED(get_arch_config_device);
- GGML_UNUSED(vec_dot_q2_K_q8_1_mul_mat);
- NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-template <bool need_check> static __global__ void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
- __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q3_K.rdna2.nwarps, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#elif __CUDA_ARCH__ < CC_VOLTA
- __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q3_K.pascal.nwarps, 2)
-#endif // __CUDA_ARCH__ < CC_VOLTA
- mul_mat_q3_K(
- const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
- const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A
- constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q3_K);
-
- mul_mat_q,
- load_tiles_q3_K, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-#else
- GGML_UNUSED(get_arch_config_device);
- GGML_UNUSED(vec_dot_q3_K_q8_1_mul_mat);
- NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-template <bool need_check> static __global__ void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
- __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q4_K.rdna2.nwarps, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#elif __CUDA_ARCH__ < CC_VOLTA
- __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q4_K.pascal.nwarps, 2)
-#endif // __CUDA_ARCH__ < CC_VOLTA
- mul_mat_q4_K(
- const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
- const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A
- constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q4_K);
-
- mul_mat_q,
- load_tiles_q4_K, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-#else
- GGML_UNUSED(get_arch_config_device);
- GGML_UNUSED(vec_dot_q4_K_q8_1_mul_mat);
- NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-template <bool need_check> static __global__ void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
- __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q5_K.rdna2.nwarps, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-mul_mat_q5_K(
- const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
- const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A
- constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q5_K);
-
- mul_mat_q,
- load_tiles_q5_K, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-#else
- GGML_UNUSED(get_arch_config_device);
- GGML_UNUSED(vec_dot_q5_K_q8_1_mul_mat);
- NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-template <bool need_check> static __global__ void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
- __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q6_K.rdna2.nwarps, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#elif __CUDA_ARCH__ < CC_VOLTA
- __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q4_K.pascal.nwarps, 2)
-#endif // __CUDA_ARCH__ < CC_VOLTA
- mul_mat_q6_K(
- const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
- const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A
- constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q6_K);
-
- mul_mat_q,
- load_tiles_q6_K, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
- (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-#else
- GGML_UNUSED(get_arch_config_device);
- GGML_UNUSED(vec_dot_q6_K_q8_1_mul_mat);
- NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-#define MMQ_SWITCH_CASE(type_suffix) \
- case GGML_TYPE_Q##type_suffix: if (row_diff % arch_config.y == 0) { \
- const bool need_check = false; \
-        mul_mat_q##type_suffix<need_check><<<block_nums, block_dims, 0, stream>>> \
- (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst); \
- } else { \
- const bool need_check = true; \
-        mul_mat_q##type_suffix<need_check><<<block_nums, block_dims, 0, stream>>> \
- (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst); \
- } break; \
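For one concrete type, MMQ_SWITCH_CASE(4_0) expands (whitespace and line continuations elided) to roughly the following, picking the bounds-checked kernel variant only when the row count is not a multiple of the tile height:

    case GGML_TYPE_Q4_0: if (row_diff % arch_config.y == 0) {
            const bool need_check = false;
            mul_mat_q4_0<need_check><<<block_nums, block_dims, 0, stream>>>
                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst);
        } else {
            const bool need_check = true;
            mul_mat_q4_0<need_check><<<block_nums, block_dims, 0, stream>>>
                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst);
        } break;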
void ggml_cuda_op_mul_mat_q(
ggml_backend_cuda_context & ctx,
@@ -1454,12 +8,15 @@ void ggml_cuda_op_mul_mat_q(
const int64_t ne00 = src0->ne[0];
+ const int64_t nb01 = src0->nb[1];
+
const int64_t ne10 = src1->ne[0];
GGML_ASSERT(ne10 % QK8_1 == 0);
const int64_t ne0 = dst->ne[0];
const int64_t row_diff = row_high - row_low;
+ const int64_t stride00 = nb01 / ggml_type_size(src0->type);
int id = ggml_cuda_get_device();
const int compute_capability = ggml_cuda_info().devices[id].cc;
@@ -1468,78 +25,44 @@ void ggml_cuda_op_mul_mat_q(
// nrows_dst == nrows of the matrix that the kernel writes into
const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff;
- mmq_config_t mmq_config;
+ const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, nrows_dst};
switch (src0->type) {
case GGML_TYPE_Q4_0:
- mmq_config = MMQ_CONFIG_Q4_0;
+            mul_mat_q_case<GGML_TYPE_Q4_0>(args, stream);
break;
case GGML_TYPE_Q4_1:
- mmq_config = MMQ_CONFIG_Q4_1;
+            mul_mat_q_case<GGML_TYPE_Q4_1>(args, stream);
break;
case GGML_TYPE_Q5_0:
- mmq_config = MMQ_CONFIG_Q5_0;
+            mul_mat_q_case<GGML_TYPE_Q5_0>(args, stream);
break;
case GGML_TYPE_Q5_1:
- mmq_config = MMQ_CONFIG_Q5_1;
+            mul_mat_q_case<GGML_TYPE_Q5_1>(args, stream);
break;
case GGML_TYPE_Q8_0:
- mmq_config = MMQ_CONFIG_Q8_0;
+            mul_mat_q_case<GGML_TYPE_Q8_0>(args, stream);
break;
case GGML_TYPE_Q2_K:
- mmq_config = MMQ_CONFIG_Q2_K;
+            mul_mat_q_case<GGML_TYPE_Q2_K>(args, stream);
break;
case GGML_TYPE_Q3_K:
- mmq_config = MMQ_CONFIG_Q3_K;
+            mul_mat_q_case<GGML_TYPE_Q3_K>(args, stream);
break;
case GGML_TYPE_Q4_K:
- mmq_config = MMQ_CONFIG_Q4_K;
+            mul_mat_q_case<GGML_TYPE_Q4_K>(args, stream);
break;
case GGML_TYPE_Q5_K:
- mmq_config = MMQ_CONFIG_Q5_K;
+            mul_mat_q_case<GGML_TYPE_Q5_K>(args, stream);
break;
case GGML_TYPE_Q6_K:
- mmq_config = MMQ_CONFIG_Q6_K;
+            mul_mat_q_case<GGML_TYPE_Q6_K>(args, stream);
break;
default:
GGML_ASSERT(false);
break;
}
- mmq_arch_config_t arch_config;
- if (compute_capability >= CC_RDNA2) {
- arch_config = mmq_config.rdna2;
- } else if (compute_capability >= CC_OFFSET_AMD) {
- arch_config = mmq_config.rdna1;
- } else if (compute_capability >= CC_VOLTA) {
- arch_config = mmq_config.ampere;
- } else if (compute_capability >= MIN_CC_DP4A) {
- arch_config = mmq_config.pascal;
- } else {
- GGML_ASSERT(false);
- }
-
- const int block_num_x = (row_diff + arch_config.y - 1) / arch_config.y;
- const int block_num_y = (src1_ncols + arch_config.x - 1) / arch_config.x;
- const dim3 block_nums(block_num_x, block_num_y, 1);
- const dim3 block_dims(WARP_SIZE, arch_config.nwarps, 1);
-
- switch (src0->type) {
- MMQ_SWITCH_CASE(4_0)
- MMQ_SWITCH_CASE(4_1)
- MMQ_SWITCH_CASE(5_0)
- MMQ_SWITCH_CASE(5_1)
- MMQ_SWITCH_CASE(8_0)
- MMQ_SWITCH_CASE(2_K)
- MMQ_SWITCH_CASE(3_K)
- MMQ_SWITCH_CASE(4_K)
- MMQ_SWITCH_CASE(5_K)
- MMQ_SWITCH_CASE(6_K)
- default:
- GGML_ASSERT(false);
- break;
- }
-
GGML_UNUSED(src1);
GGML_UNUSED(dst);
GGML_UNUSED(src1_ddf_i);
diff --git a/ggml-cuda/mmq.cuh b/ggml-cuda/mmq.cuh
index 807817c4a..6744cce6d 100644
--- a/ggml-cuda/mmq.cuh
+++ b/ggml-cuda/mmq.cuh
@@ -1,4 +1,1304 @@
#include "common.cuh"
+#include "vecdotq.cuh"
+
+#include <climits>
+#include <cstdint>
+
+typedef void (*load_tiles_mmq_t)(
+ const char * __restrict__ x, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
+ int * __restrict__ x_sc, const int & kbx0, const int & i_max, const int & stride);
+typedef void (*vec_dot_mmq_t)(
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ms, float * __restrict__ sum, const int & k0);
+
+struct tile_x_sizes {
+ int ql;
+ int dm;
+ int qh;
+ int sc;
+};
+
+// get_mmq_x_max_host is in common.cuh so that it can be used to determine the correct way to round for --split-mode row
+
+static constexpr __device__ int get_mmq_x_max_device() {
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ return 64;
+#else
+#if __CUDA_ARCH__ >= CC_VOLTA
+#ifdef CUDA_USE_TENSOR_CORES
+ return MMQ_MAX_BATCH_SIZE;
+#else
+ return 128;
+#endif // CUDA_USE_TENSOR_CORES
+#else
+ return 64;
+#endif // __CUDA_ARCH__ >= CC_VOLTA
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+}
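The value returned here is only an upper bound on the tile width mmq_x; the launch grid is then derived from the chosen mmq_x/mmq_y by rounding the matrix extents up, in the same way as the host code removed from ggml-cuda/mmq.cu earlier in this diff. A standalone sketch with assumed example numbers:

    // Grid-size computation in the style of the removed launch code
    // (block_num_x / block_num_y); the concrete numbers are examples only.
    #include <cstdio>

    int main() {
        const int mmq_x = 64;                          // at most get_mmq_x_max_device()
        const int mmq_y = 128;                         // get_mmq_y_device(mmq_x)
        const int row_diff = 4000, src1_ncols = 100;   // example matrix extents
        const int block_num_x = (row_diff   + mmq_y - 1) / mmq_y;
        const int block_num_y = (src1_ncols + mmq_x - 1) / mmq_x;
        printf("grid = %d x %d blocks\n", block_num_x, block_num_y);
        return 0;
    }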
+
+// get_mmq_y_host is in common.cuh so that it can be used to determine the correct way to round for --split-mode row
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+static constexpr __device__ int get_mmq_y_device(int mmq_x) {
+ return mmq_x >= 32 ? 128 : 64;
+}
+#else
+#if __CUDA_ARCH__ >= CC_VOLTA
+static constexpr __device__ int get_mmq_y_device(int mmq_x) {
+ return mmq_x >= 32 ? 128 : 64;
+}
+#else
+static constexpr __device__ int get_mmq_y_device(int /*mmq_x*/) {
+ return 64;
+}
+#endif // __CUDA_ARCH__ >= CC_VOLTA
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+
+#define TILE_X_SIZES_Q4_0 tile_x_sizes{mmq_y*WARP_SIZE + mmq_y, mmq_y*WARP_SIZE/QI4_0 + mmq_y/QI4_0, 0, 0}
+#define TILE_X_SIZES_Q4_1 tile_x_sizes{mmq_y*WARP_SIZE + mmq_y, mmq_y*WARP_SIZE/QI4_1 + mmq_y/QI4_1, 0, 0}
+#define TILE_X_SIZES_Q5_0 tile_x_sizes{mmq_y*WARP_SIZE*2 + mmq_y, mmq_y*WARP_SIZE/QI5_0 + mmq_y/QI5_0, 0, 0}
+#define TILE_X_SIZES_Q5_1 tile_x_sizes{mmq_y*WARP_SIZE*2 + mmq_y, mmq_y*WARP_SIZE/QI5_1 + mmq_y/QI5_1, 0, 0}
+#define TILE_X_SIZES_Q8_0 tile_x_sizes{mmq_y*WARP_SIZE + mmq_y, mmq_y*WARP_SIZE/QI8_0 + mmq_y/QI8_0, 0, 0}
+#define TILE_X_SIZES_Q2_K tile_x_sizes{mmq_y*WARP_SIZE + mmq_y, mmq_y*WARP_SIZE/QI2_K + mmq_y/QI2_K, 0, mmq_y*WARP_SIZE/4 + mmq_y/4}
+#define TILE_X_SIZES_Q3_K tile_x_sizes{mmq_y*WARP_SIZE + mmq_y, mmq_y*WARP_SIZE/QI3_K + mmq_y/QI3_K, mmq_y*WARP_SIZE/2 + mmq_y/2, mmq_y*WARP_SIZE/4 + mmq_y/4}
+#define TILE_X_SIZES_Q4_K tile_x_sizes{mmq_y*WARP_SIZE + mmq_y, mmq_y*WARP_SIZE/QI4_K + mmq_y/QI4_K, 0, mmq_y*WARP_SIZE/8 + mmq_y/8}
+#define TILE_X_SIZES_Q5_K tile_x_sizes{mmq_y*WARP_SIZE*2 + mmq_y, mmq_y*WARP_SIZE/QI5_K + mmq_y/QI5_K, 0, mmq_y*WARP_SIZE/8 + mmq_y/8}
+#define TILE_X_SIZES_Q6_K tile_x_sizes{mmq_y*WARP_SIZE*2 + mmq_y, mmq_y*WARP_SIZE/QI6_K + mmq_y/QI6_K, 0, mmq_y*WARP_SIZE/8 + mmq_y/8}
+
+#define GET_TILE_X_SIZES_BODY \
+ return type == GGML_TYPE_Q4_0 ? TILE_X_SIZES_Q4_0 : \
+ type == GGML_TYPE_Q4_1 ? TILE_X_SIZES_Q4_1 : \
+ type == GGML_TYPE_Q5_0 ? TILE_X_SIZES_Q5_0 : \
+ type == GGML_TYPE_Q5_1 ? TILE_X_SIZES_Q5_1 : \
+ type == GGML_TYPE_Q8_0 ? TILE_X_SIZES_Q8_0 : \
+ type == GGML_TYPE_Q2_K ? TILE_X_SIZES_Q2_K : \
+ type == GGML_TYPE_Q3_K ? TILE_X_SIZES_Q3_K : \
+ type == GGML_TYPE_Q4_K ? TILE_X_SIZES_Q4_K : \
+ type == GGML_TYPE_Q5_K ? TILE_X_SIZES_Q5_K : \
+ type == GGML_TYPE_Q6_K ? TILE_X_SIZES_Q6_K : \
+ tile_x_sizes{0, 0, 0, 0}
+
+static tile_x_sizes get_tile_x_sizes_host(const ggml_type type, const int mmq_y) {
+ GET_TILE_X_SIZES_BODY;
+}
+
+template <int mmq_y>
+static constexpr __device__ tile_x_sizes get_tile_x_sizes_device(ggml_type type) {
+ GET_TILE_X_SIZES_BODY;
+}
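The tile_x_sizes fields are element counts (ints for ql/qh/sc, half2 entries for dm), with qh only nonzero for Q3_K. One plausible use, sketched here as an assumption rather than as the layout the rest of this header necessarily adopts, is to carve a single dynamic shared-memory buffer into the four x tiles:

    // Sketch only: split one shared-memory buffer according to tile_x_sizes.
    // Assumes the buffer is large enough and suitably aligned for half2.
    template <int mmq_y>
    static __device__ void carve_x_tiles(char * smem, ggml_type type,
                                         int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
        const tile_x_sizes txs = get_tile_x_sizes_device<mmq_y>(type);
        *x_ql = (int   *)  smem;                // txs.ql ints of packed quant values
        *x_dm = (half2 *) (*x_ql + txs.ql);     // txs.dm scales (or scale/min pairs)
        *x_qh = (int   *) (*x_dm + txs.dm);     // txs.qh ints of high bits (Q3_K only)
        *x_sc = (int   *) (*x_qh + txs.qh);     // txs.sc ints of packed sub-block scales
    }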
+
+// ------------------------------------------------------------
+
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
+ const char * __restrict__ x, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
+ int * __restrict__ x_sc, const int & kbx0, const int & i_max, const int & stride) {
+ GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
+
+ const int kbx = threadIdx.x / QI4_0;
+ const int kqsx = threadIdx.x % QI4_0;
+
+ float * x_dmf = (float *) x_dm;
+
+#pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+ int i = i0 + threadIdx.y;
+
+ if (need_check) {
+ i = min(i, i_max);
+ }
+
+ const block_q4_0 * bxi = (const block_q4_0 *) x + kbx0 + i*stride + kbx;
+
+ x_ql[i * (WARP_SIZE + 1) + threadIdx.x] = get_int_from_uint8(bxi->qs, kqsx);
+ }
+
+ const int blocks_per_tile_x_row = WARP_SIZE / QI4_0;
+ const int kbxd = threadIdx.x % blocks_per_tile_x_row;
+
+#pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_0) {
+ int i = i0 + threadIdx.y * QI4_0 + threadIdx.x / blocks_per_tile_x_row;
+
+ if (need_check) {
+ i = min(i, i_max);
+ }
+
+ const block_q4_0 * bxi = (const block_q4_0 *) x + kbx0 + i*stride + kbxd;
+
+ x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd] = bxi->d;
+ }
+}
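Note the x_ql row stride of WARP_SIZE + 1 ints (matching the mmq_y*WARP_SIZE + mmq_y entry in TILE_X_SIZES_Q4_0); the extra element per row is presumably the usual padding that keeps consecutive rows on different shared-memory banks. A toy illustration of the resulting offsets:

    // Toy illustration of the padded row stride used for x_ql above:
    // element (i, k) lives at i*(WARP_SIZE + 1) + k, so row starts shift by one bank per row.
    #include <cstdio>

    int main() {
        const int WARP_SIZE = 32;
        for (int i = 0; i < 4; ++i) {
            const int row_start = i*(WARP_SIZE + 1);
            printf("row %d starts at %3d (bank %2d)\n", i, row_start, row_start % 32);
        }
        return 0;
    }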
+
+template <int mmq_x, int mmq_y, int nwarps>
+static __device__ __forceinline__ void vec_dot_q4_0_q8_1_mul_mat(
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) {
+
+ GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
+
+#pragma unroll
+ for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
+ const int j = j0 + threadIdx.y;
+
+#pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
+ const int i = i0 + threadIdx.x;
+
+ const int kyqs = k0 % (QI8_1/2) + QI8_1 * (k0 / (QI8_1/2));
+ const float * x_dmf = (const float *) x_dm;
+
+ int u[2*VDR_Q4_0_Q8_1_MMQ];
+
+#pragma unroll
+ for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) {
+ u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
+ u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE];
+ }
+
+            sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMQ>
+ (&x_ql[i * (WARP_SIZE + 1) + k0], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k0/QI4_0],
+ y_ds[j * (WARP_SIZE/QI8_1) + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)]);
+ }
+ }
+}
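The index kyqs above interleaves the q8_1 values so that, for a given k0, the low nibbles are paired with y_qs[kyqs + l] and the high nibbles with y_qs[kyqs + l + QI4_0]. A standalone check of that mapping, assuming QI8_1 == 8 and QI4_0 == 4 (their values in ggml's quantization headers):

    // Print the kyqs mapping used by vec_dot_q4_0_q8_1_mul_mat above.
    #include <cstdio>

    int main() {
        const int QI8_1 = 8, QI4_0 = 4, WARP_SIZE = 32;
        for (int k0 = 0; k0 < WARP_SIZE; ++k0) {
            const int kyqs = k0 % (QI8_1/2) + QI8_1 * (k0 / (QI8_1/2));
            printf("k0=%2d -> low nibbles at %2d.., high nibbles at %2d..\n",
                   k0, kyqs % WARP_SIZE, (kyqs + QI4_0) % WARP_SIZE);
        }
        return 0;
    }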
+
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
+ const char * __restrict__ x, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
+ int * __restrict__ x_sc, const int & kbx0, const int & i_max, const int & stride) {
+ GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
+
+ const int kbx = threadIdx.x / QI4_1;
+ const int kqsx = threadIdx.x % QI4_1;
+
+#pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+ int i = i0 + threadIdx.y;
+
+ if (need_check) {
+ i = min(i, i_max);
+ }
+
+ const block_q4_1 * bxi = (const block_q4_1 *) x + kbx0 + i*stride + kbx;
+
+ x_ql[i * (WARP_SIZE + 1) + threadIdx.x] = get_int_from_uint8_aligned(bxi->qs, kqsx);
+ }
+
+ const int blocks_per_tile_x_row = WARP_SIZE / QI4_1;
+ const int kbxd = threadIdx.x % blocks_per_tile_x_row;
+
+#pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_1) {
+ int i = i0 + threadIdx.y * QI4_1 + threadIdx.x / blocks_per_tile_x_row;
+
+ if (need_check) {
+ i = min(i, i_max);
+ }
+
+ const block_q4_1 * bxi = (const block_q4_1 *) x + kbx0 + i*stride + kbxd;
+
+ x_dm[i * (WARP_SIZE/QI4_1) + i / QI4_1 + kbxd] = bxi->dm;
+ }
+}
+
+template <int mmq_x, int mmq_y, int nwarps>
+static __device__ __forceinline__ void vec_dot_q4_1_q8_1_mul_mat(
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) {
+
+ GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
+
+#pragma unroll
+ for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
+ const int j = j0 + threadIdx.y;
+
+#pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
+ const int i = i0 + threadIdx.x;
+
+ const int kyqs = k0 % (QI8_1/2) + QI8_1 * (k0 / (QI8_1/2));
+
+ int u[2*VDR_Q4_1_Q8_1_MMQ];
+
+#pragma unroll
+ for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) {
+ u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
+ u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE];
+ }
+
+            sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMQ>
+ (&x_ql[i * (WARP_SIZE + 1) + k0], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k0/QI4_1],
+ y_ds[j * (WARP_SIZE/QI8_1) + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)]);
+ }
+ }
+}
+
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
+ const char * __restrict__ x, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
+ int * __restrict__ x_sc, const int & kbx0, const int & i_max, const int & stride) {
+ GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
+
+ const int kbx = threadIdx.x / QI5_0;
+ const int kqsx = threadIdx.x % QI5_0;
+
+#pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+ int i = i0 + threadIdx.y;
+
+ if (need_check) {
+ i = min(i, i_max);
+ }
+
+ const block_q5_0 * bxi = (const block_q5_0 *) x + kbx0 + i*stride + kbx;
+
+ const int ql = get_int_from_uint8(bxi->qs, kqsx);
+ const int qh = get_int_from_uint8(bxi->qh, 0) >> (4 * (threadIdx.x % QI5_0));
+
+ int qs0 = (ql >> 0) & 0x0F0F0F0F;
+ qs0 |= (qh << 4) & 0x00000010; // 0 -> 4
+ qs0 |= (qh << 11) & 0x00001000; // 1 -> 12
+ qs0 |= (qh << 18) & 0x00100000; // 2 -> 20
+ qs0 |= (qh << 25) & 0x10000000; // 3 -> 28
+ qs0 = __vsubss4(qs0, 0x10101010); // subtract 16
+
+ x_ql[i * (2*WARP_SIZE + 1) + 2*threadIdx.x+0] = qs0;
+
+ int qs1 = (ql >> 4) & 0x0F0F0F0F;
+ qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4
+ qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12
+ qs1 |= (qh << 2) & 0x00100000; // 18 -> 20
+ qs1 |= (qh << 9) & 0x10000000; // 19 -> 28
+ qs1 = __vsubss4(qs1, 0x10101010); // subtract 16
+
+ x_ql[i * (2*WARP_SIZE + 1) + 2*threadIdx.x+1] = qs1;
+ }
+
+ const int blocks_per_tile_x_row = WARP_SIZE / QI5_0;
+ const int kbxd = threadIdx.x % blocks_per_tile_x_row;
+ float * x_dmf = (float *) x_dm;
+
+#pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_0) {
+ int i = i0 + threadIdx.y * QI5_0 + threadIdx.x / blocks_per_tile_x_row;
+
+ if (need_check) {
+ i = min(i, i_max);
+ }
+
+ const block_q5_0 * bxi = (const block_q5_0 *) x + kbx0 + i*stride + kbxd;
+
+ x_dmf[i * (WARP_SIZE/QI5_0) + i / QI5_0 + kbxd] = bxi->d;
+ }
+}
+
+template <int mmq_x, int mmq_y, int nwarps>
+static __device__ __forceinline__ void vec_dot_q5_0_q8_1_mul_mat(
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) {
+
+ GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
+
+#pragma unroll
+ for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
+ const int j = j0 + threadIdx.y;
+
+#pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
+ const int i = i0 + threadIdx.x;
+
+ const int kyqs = k0 % (QI8_1/2) + QI8_1 * (k0 / (QI8_1/2));
+ const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k0/QI5_0;
+ const float * x_dmf = (const float *) x_dm;
+ const float * y_df = (const float *) y_ds;
+
+ int u[2*VDR_Q5_0_Q8_1_MMQ];
+
+#pragma unroll
+ for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) {
+ u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
+ u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE];
+ }
+
+            sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q8_0_q8_1_impl<QR5_0*VDR_Q5_0_Q8_1_MMQ>
+ (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k0], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)]);
+ }
+ }
+}
+
+
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
+ const char * __restrict__ x, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
+ int * __restrict__ x_sc, const int & kbx0, const int & i_max, const int & stride) {
+ GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
+
+ const int kbx = threadIdx.x / QI5_1;
+ const int kqsx = threadIdx.x % QI5_1;
+
+#pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+ int i = i0 + threadIdx.y;
+
+ if (need_check) {
+ i = min(i, i_max);
+ }
+
+ const block_q5_1 * bxi = (const block_q5_1 *) x + kbx0 + i*stride + kbx;
+
+ const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
+ const int qh = get_int_from_uint8_aligned(bxi->qh, 0) >> (4 * (threadIdx.x % QI5_1));
+
+ int qs0 = (ql >> 0) & 0x0F0F0F0F;
+ qs0 |= (qh << 4) & 0x00000010; // 0 -> 4
+ qs0 |= (qh << 11) & 0x00001000; // 1 -> 12
+ qs0 |= (qh << 18) & 0x00100000; // 2 -> 20
+ qs0 |= (qh << 25) & 0x10000000; // 3 -> 28
+
+ x_ql[i * (2*WARP_SIZE + 1) + 2*threadIdx.x+0] = qs0;
+
+ int qs1 = (ql >> 4) & 0x0F0F0F0F;
+ qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4
+ qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12
+ qs1 |= (qh << 2) & 0x00100000; // 18 -> 20
+ qs1 |= (qh << 9) & 0x10000000; // 19 -> 28
+
+ x_ql[i * (2*WARP_SIZE + 1) + 2*threadIdx.x+1] = qs1;
+ }
+
+ const int blocks_per_tile_x_row = WARP_SIZE / QI5_1;
+ const int kbxd = threadIdx.x % blocks_per_tile_x_row;
+
+#pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_1) {
+ int i = i0 + threadIdx.y * QI5_1 + threadIdx.x / blocks_per_tile_x_row;
+
+ if (need_check) {
+ i = min(i, i_max);
+ }
+
+ const block_q5_1 * bxi = (const block_q5_1 *) x + kbx0 + i*stride + kbxd;
+
+ x_dm[i * (WARP_SIZE/QI5_1) + i / QI5_1 + kbxd] = bxi->dm;
+ }
+}
+
+template <int mmq_x, int mmq_y, int nwarps>
+static __device__ __forceinline__ void vec_dot_q5_1_q8_1_mul_mat(
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) {
+
+ GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
+
+#pragma unroll
+ for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
+ const int j = j0 + threadIdx.y;
+
+#pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
+ const int i = i0 + threadIdx.x;
+
+ const int kyqs = k0 % (QI8_1/2) + QI8_1 * (k0 / (QI8_1/2));
+            const int index_bx = i * (WARP_SIZE/QI5_1) + i/QI5_1 + k0/QI5_1;
+
+ int u[2*VDR_Q5_1_Q8_1_MMQ];
+
+#pragma unroll
+ for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) {
+ u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
+ u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE];
+ }
+
+            sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q8_1_q8_1_impl<QR5_1*VDR_Q5_1_Q8_1_MMQ>
+ (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k0], u, x_dm[index_bx], y_ds[j * (WARP_SIZE/QI8_1) + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)]);
+ }
+ }
+}
+
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
+ const char * __restrict__ x, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
+ int * __restrict__ x_sc, const int & kbx0, const int & i_max, const int & stride) {
+
+ GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
+
+ const int kbx = threadIdx.x / QI8_0;
+ const int kqsx = threadIdx.x % QI8_0;
+ float * x_dmf = (float *) x_dm;
+
+#pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+ int i = i0 + threadIdx.y;
+
+ if (need_check) {
+ i = min(i, i_max);
+ }
+
+ const block_q8_0 * bxi = (const block_q8_0 *) x + kbx0 + i*stride + kbx;
+
+ x_ql[i * (WARP_SIZE + 1) + threadIdx.x] = get_int_from_int8(bxi->qs, kqsx);
+ }
+
+ const int blocks_per_tile_x_row = WARP_SIZE / QI8_0;
+ const int kbxd = threadIdx.x % blocks_per_tile_x_row;
+
+#pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI8_0) {
+ int i = i0 + threadIdx.y * QI8_0 + threadIdx.x / blocks_per_tile_x_row;
+
+ if (need_check) {
+ i = min(i, i_max);
+ }
+
+ const block_q8_0 * bxi = (const block_q8_0 *) x + kbx0 + i*stride + kbxd;
+
+ x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd] = bxi->d;
+ }
+}
+
+template <int mmq_x, int mmq_y, int nwarps>
+static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mul_mat(
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) {
+
+ GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
+
+#pragma unroll
+ for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
+ const int j = j0 + threadIdx.y;
+
+#pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
+ const int i = i0 + threadIdx.x;
+
+ const float * x_dmf = (const float *) x_dm;
+ const float * y_df = (const float *) y_ds;
+
+            sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMQ>
+ (&x_ql[i * (WARP_SIZE + 1) + k0], &y_qs[j * WARP_SIZE + k0], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k0/QI8_0],
+ y_df[j * (WARP_SIZE/QI8_1) + k0/QI8_1]);
+ }
+ }
+}
+
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
+ const char * __restrict__ x, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
+ int * __restrict__ x_sc, const int & kbx0, const int & i_max, const int & stride) {
+ GGML_UNUSED(x_qh);
+
+ const int kbx = threadIdx.x / QI2_K;
+ const int kqsx = threadIdx.x % QI2_K;
+
+#pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+ int i = i0 + threadIdx.y;
+
+ if (need_check) {
+ i = min(i, i_max);
+ }
+
+ const block_q2_K * bxi = (const block_q2_K *) x + kbx0 + i*stride + kbx;
+
+ x_ql[i * (WARP_SIZE + 1) + threadIdx.x] = get_int_from_uint8_aligned(bxi->qs, kqsx);
+ }
+
+ const int blocks_per_tile_x_row = WARP_SIZE / QI2_K;
+ const int kbxd = threadIdx.x % blocks_per_tile_x_row;
+
+#pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI2_K) {
+ int i = (i0 + threadIdx.y * QI2_K + threadIdx.x / blocks_per_tile_x_row) % mmq_y;
+
+ if (need_check) {
+ i = min(i, i_max);
+ }
+
+ const block_q2_K * bxi = (const block_q2_K *) x + kbx0 + i*stride + kbxd;
+
+ x_dm[i * (WARP_SIZE/QI2_K) + i / QI2_K + kbxd] = bxi->dm;
+ }
+
+#pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
+ int i = i0 + threadIdx.y * 4 + threadIdx.x / (WARP_SIZE/4);
+
+ if (need_check) {
+ i = min(i, i_max);
+ }
+
+ const block_q2_K * bxi = (const block_q2_K *) x + kbx0 + i*stride + (threadIdx.x % (WARP_SIZE/4)) / (QI2_K/4);
+
+ x_sc[i * (WARP_SIZE/4) + i / 4 + threadIdx.x % (WARP_SIZE/4)] = get_int_from_uint8_aligned(bxi->scales, threadIdx.x % (QI2_K/4));
+ }
+}
+
+template <int mmq_x, int mmq_y, int nwarps>
+static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mul_mat(
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) {
+
+ GGML_UNUSED(x_qh);
+
+#pragma unroll
+ for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
+ const int j = j0 + threadIdx.y;
+
+#pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
+ const int i = i0 + threadIdx.x;
+
+ const int kbx = k0 / QI2_K;
+ const int ky = (k0 % QI2_K) * QR2_K;
+ const float * y_df = (const float *) y_ds;
+
+ int v[QR2_K*VDR_Q2_K_Q8_1_MMQ];
+
+ const int kqsx = i * (WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2);
+ const int shift = 2 * ((ky % (2*QI2_K)) / (QI2_K/2));
+
+#pragma unroll
+ for (int l = 0; l < QR2_K*VDR_Q2_K_Q8_1_MMQ; ++l) {
+ v[l] = (x_ql[kqsx + l] >> shift) & 0x03030303;
+ }
+
+ const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4;
+
+ const int index_y = j * WARP_SIZE + (QR2_K*k0) % WARP_SIZE;
+ sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q2_K_q8_1_impl_mmq(
+ v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]);
+ }
+ }
+}
+
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q3_K(
+ const char * __restrict__ x, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
+ int * __restrict__ x_sc, const int & kbx0, const int & i_max, const int & stride) {
+
+ const int kbx = threadIdx.x / QI3_K;
+ const int kqsx = threadIdx.x % QI3_K;
+
+#pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+ int i = i0 + threadIdx.y;
+
+ if (need_check) {
+ i = min(i, i_max);
+ }
+
+ const block_q3_K * bxi = (const block_q3_K *) x + kbx0 + i*stride + kbx;
+
+ x_ql[i * (WARP_SIZE + 1) + threadIdx.x] = get_int_from_uint8(bxi->qs, kqsx);
+ }
+
+ const int blocks_per_tile_x_row = WARP_SIZE / QI3_K;
+ const int kbxd = threadIdx.x % blocks_per_tile_x_row;
+ float * x_dmf = (float *) x_dm;
+
+#pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI3_K) {
+ int i = (i0 + threadIdx.y * QI3_K + threadIdx.x / blocks_per_tile_x_row) % mmq_y;
+
+ if (need_check) {
+ i = min(i, i_max);
+ }
+
+ const block_q3_K * bxi = (const block_q3_K *) x + kbx0 + i*stride + kbxd;
+
+ x_dmf[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd] = bxi->d;
+ }
+
+#pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 2) {
+ int i = i0 + threadIdx.y * 2 + threadIdx.x / (WARP_SIZE/2);
+
+ if (need_check) {
+ i = min(i, i_max);
+ }
+
+ const block_q3_K * bxi = (const block_q3_K *) x + kbx0 + i*stride + (threadIdx.x % (WARP_SIZE/2)) / (QI3_K/2);
+
+ // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
+ x_qh[i * (WARP_SIZE/2) + i / 2 + threadIdx.x % (WARP_SIZE/2)] = ~get_int_from_uint8(bxi->hmask, threadIdx.x % (QI3_K/2));
+ }
+
+#pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
+ int i = i0 + threadIdx.y * 4 + threadIdx.x / (WARP_SIZE/4);
+
+ if (need_check) {
+ i = min(i, i_max);
+ }
+
+ const block_q3_K * bxi = (const block_q3_K *) x + kbx0 + i*stride + (threadIdx.x % (WARP_SIZE/4)) / (QI3_K/4);
+
+ const int ksc = threadIdx.x % (QI3_K/4);
+
+ const int ksc_low = ksc % (QI3_K/8);
+ const int shift_low = 4 * (ksc / (QI3_K/8));
+ const int sc_low = (get_int_from_uint8(bxi->scales, ksc_low) >> shift_low) & 0x0F0F0F0F;
+
+ const int ksc_high = QI3_K/8;
+ const int shift_high = 2 * ksc;
+ const int sc_high = ((get_int_from_uint8(bxi->scales, ksc_high) >> shift_high) << 4) & 0x30303030;
+
+ const int sc = __vsubss4(sc_low | sc_high, 0x20202020);
+
+ x_sc[i * (WARP_SIZE/4) + i / 4 + threadIdx.x % (WARP_SIZE/4)] = sc;
+ }
+}
+
+template <int mmq_x, int mmq_y, int nwarps>
+static __device__ __forceinline__ void vec_dot_q3_K_q8_1_mul_mat(
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) {
+
+#pragma unroll
+ for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
+ const int j = j0 + threadIdx.y;
+
+#pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
+ const int i = i0 + threadIdx.x;
+
+ const int kbx = k0 / QI3_K;
+ const int ky = (k0 % QI3_K) * QR3_K;
+ const float * x_dmf = (const float *) x_dm;
+ const float * y_df = (const float *) y_ds;
+
+ const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
+
+ int v[QR3_K*VDR_Q3_K_Q8_1_MMQ];
+
+#pragma unroll
+ for (int l = 0; l < QR3_K*VDR_Q3_K_Q8_1_MMQ; ++l) {
+ const int kqsx = i * (WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2);
+ const int shift = 2 * ((ky % 32) / 8);
+ const int vll = (x_ql[kqsx + l] >> shift) & 0x03030303;
+
+ const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8);
+ const int vlh = (vh << 2) & 0x04040404;
+
+ v[l] = __vsubss4(vll, vlh);
+ }
+
+ const int index_y = j * WARP_SIZE + (k0*QR3_K) % WARP_SIZE;
+ sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q3_K_q8_1_impl_mmq(
+ v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]);
+ }
+ }
+}
+
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
+ const char * __restrict__ x, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
+ int * __restrict__ x_sc, const int & kbx0, const int & i_max, const int & stride) {
+ GGML_UNUSED(x_qh);
+
+ const int kbx = 0; // threadIdx.x / QI4_K
+ const int kqsx = threadIdx.x; // threadIdx.x % QI4_K
+
+#pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+ int i = i0 + threadIdx.y;
+
+ if (need_check) {
+ i = min(i, i_max);
+ }
+
+ const block_q4_K * bxi = (const block_q4_K *) x + kbx0 + i*stride + kbx;
+
+ x_ql[i * (WARP_SIZE + 1) + threadIdx.x] = get_int_from_uint8_aligned(bxi->qs, kqsx);
+ }
+
+ const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256
+ const int kbxd = threadIdx.x % blocks_per_tile_x_row; // == 0 if QK_K == 256
+
+#pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_K) {
+ int i = (i0 + threadIdx.y * QI4_K + threadIdx.x / blocks_per_tile_x_row) % mmq_y;
+
+ if (need_check) {
+ i = min(i, i_max);
+ }
+
+ const block_q4_K * bxi = (const block_q4_K *) x + kbx0 + i*stride + kbxd;
+
+ x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
+ }
+
+#pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
+ int i = (i0 + threadIdx.y * 8 + threadIdx.x / (WARP_SIZE/8)) % mmq_y;
+
+ if (need_check) {
+ i = min(i, i_max);
+ }
+
+ const block_q4_K * bxi = (const block_q4_K *) x + kbx0 + i*stride + (threadIdx.x % (WARP_SIZE/8)) / (QI4_K/8);
+
+ const int * scales = (const int *) bxi->scales;
+
+ const int ksc = threadIdx.x % (WARP_SIZE/8);
+
+        // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m7
+ int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
+ scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits
+
+ x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
+ }
+}
+
+template <int mmq_x, int mmq_y, int nwarps>
+static __device__ __forceinline__ void vec_dot_q4_K_q8_1_mul_mat(
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) {
+
+ GGML_UNUSED(x_qh);
+
+#pragma unroll
+ for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
+ const int j = j0 + threadIdx.y;
+
+#pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
+ const int i = i0 + threadIdx.x;
+
+ const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k0/16]) + 2*((k0 % 16) / 8);
+
+ const int index_y = j * WARP_SIZE + (QR4_K*k0) % WARP_SIZE;
+ sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q4_K_q8_1_impl_mmq(
+ &x_ql[i * (WARP_SIZE + 1) + k0], &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
+ }
+ }
+}
+
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
+ const char * __restrict__ x, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
+ int * __restrict__ x_sc, const int & kbx0, const int & i_max, const int & stride) {
+ GGML_UNUSED(x_qh);
+
+ const int kbx = 0; // threadIdx.x / QI5_K
+ const int kqsx = threadIdx.x; // threadIdx.x % QI5_K
+
+#pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+ int i = i0 + threadIdx.y;
+
+ if (need_check) {
+ i = min(i, i_max);
+ }
+
+ const block_q5_K * bxi = (const block_q5_K *) x + kbx0 + i*stride + kbx;
+ const int ky = QR5_K*kqsx;
+
+ const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
+ const int ql0 = (ql >> 0) & 0x0F0F0F0F;
+ const int ql1 = (ql >> 4) & 0x0F0F0F0F;
+
+ const int qh = get_int_from_uint8_aligned(bxi->qh, kqsx % (QI5_K/4));
+ const int qh0 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 0)) << 4) & 0x10101010;
+ const int qh1 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 1)) << 4) & 0x10101010;
+
+ const int kq0 = ky - ky % (QI5_K/2) + threadIdx.x % (QI5_K/4) + 0;
+ const int kq1 = ky - ky % (QI5_K/2) + threadIdx.x % (QI5_K/4) + (QI5_K/4);
+
+ x_ql[i * (2*WARP_SIZE + 1) + kq0] = ql0 | qh0;
+ x_ql[i * (2*WARP_SIZE + 1) + kq1] = ql1 | qh1;
+ }
+
+ const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256
+ const int kbxd = threadIdx.x % blocks_per_tile_x_row; // == 0 if QK_K == 256
+
+#pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_K) {
+ int i = (i0 + threadIdx.y * QI5_K + threadIdx.x / blocks_per_tile_x_row) % mmq_y;
+
+ if (need_check) {
+ i = min(i, i_max);
+ }
+
+ const block_q5_K * bxi = (const block_q5_K *) x + kbx0 + i*stride + kbxd;
+
+ x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
+ }
+
+#pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
+ int i = (i0 + threadIdx.y * 8 + threadIdx.x / (WARP_SIZE/8)) % mmq_y;
+
+ if (need_check) {
+ i = min(i, i_max);
+ }
+
+ const block_q5_K * bxi = (const block_q5_K *) x + kbx0 + i*stride + (threadIdx.x % (WARP_SIZE/8)) / (QI5_K/8);
+
+ const int * scales = (const int *) bxi->scales;
+
+ const int ksc = threadIdx.x % (WARP_SIZE/8);
+
+ // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m7
+ int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
+ scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits
+
+ x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
+ }
+}
+
+template <int mmq_x, int mmq_y, int nwarps>
+static __device__ __forceinline__ void vec_dot_q5_K_q8_1_mul_mat(
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) {
+
+ GGML_UNUSED(x_qh);
+
+#pragma unroll
+ for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
+ const int j = j0 + threadIdx.y;
+
+#pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
+ const int i = i0 + threadIdx.x;
+
+ const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k0/16]) + 2 * ((k0 % 16) / 8);
+
+ const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k0;
+ const int index_y = j * WARP_SIZE + (QR5_K*k0) % WARP_SIZE;
+ sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q5_K_q8_1_impl_mmq(
+ &x_ql[index_x], &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
+ }
+ }
+}
+
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
+ const char * __restrict__ x, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
+ int * __restrict__ x_sc, const int & kbx0, const int & i_max, const int & stride) {
+ GGML_UNUSED(x_qh);
+
+ const int kbx = 0; // threadIdx.x / QI6_K
+ const int kqsx = threadIdx.x; // threadIdx.x % QI6_K
+
+#pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+ int i = i0 + threadIdx.y;
+
+ if (need_check) {
+ i = min(i, i_max);
+ }
+
+ const block_q6_K * bxi = (const block_q6_K *) x + kbx0 + i*stride + kbx;
+ const int ky = QR6_K*kqsx;
+
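+ // q6_K stores the low 4 bits in ql and the upper 2 bits in qh; after combining them,
+ // 32 (0x20 per byte) is subtracted so the stored values are centered around zero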
+ const int ql = get_int_from_uint8(bxi->ql, kqsx);
+ const int ql0 = (ql >> 0) & 0x0F0F0F0F;
+ const int ql1 = (ql >> 4) & 0x0F0F0F0F;
+
+ const int qh = get_int_from_uint8(bxi->qh, (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4));
+ const int qh0 = ((qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) << 4) & 0x30303030;
+ const int qh1 = (qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) & 0x30303030;
+
+ const int kq0 = ky - ky % QI6_K + threadIdx.x % (QI6_K/2) + 0;
+ const int kq1 = ky - ky % QI6_K + threadIdx.x % (QI6_K/2) + (QI6_K/2);
+
+ x_ql[i * (2*WARP_SIZE + 1) + kq0] = __vsubss4(ql0 | qh0, 0x20202020);
+ x_ql[i * (2*WARP_SIZE + 1) + kq1] = __vsubss4(ql1 | qh1, 0x20202020);
+ }
+
+ const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256
+ const int kbxd = threadIdx.x % blocks_per_tile_x_row; // == 0 if QK_K == 256
+ float * x_dmf = (float *) x_dm;
+
+#pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI6_K) {
+ int i = (i0 + threadIdx.y * QI6_K + threadIdx.x / blocks_per_tile_x_row) % mmq_y;
+
+ if (need_check) {
+ i = min(i, i_max);
+ }
+
+ const block_q6_K * bxi = (const block_q6_K *) x + kbx0 + i*stride + kbxd;
+
+ x_dmf[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd] = bxi->d;
+ }
+
+#pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
+ int i = (i0 + threadIdx.y * 8 + threadIdx.x / (WARP_SIZE/8)) % mmq_y;
+
+ if (need_check) {
+ i = min(i, i_max);
+ }
+
+ const block_q6_K * bxi = (const block_q6_K *) x + kbx0 + i*stride + (threadIdx.x % (WARP_SIZE/8)) / 4;
+
+ x_sc[i * (WARP_SIZE/8) + i / 8 + threadIdx.x % (WARP_SIZE/8)] = get_int_from_int8(bxi->scales, threadIdx.x % (QI6_K/8));
+ }
+}
+
+template <int mmq_x, int mmq_y, int nwarps>
+static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mul_mat(
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) {
+
+ GGML_UNUSED(x_qh);
+
+#pragma unroll
+ for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
+ const int j = j0 + threadIdx.y;
+
+#pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
+ const int i = i0 + threadIdx.x;
+
+ const float * x_dmf = (const float *) x_dm;
+ const float * y_df = (const float *) y_ds;
+
+ const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k0/8]);
+
+ const int index_x = i * (QR6_K*WARP_SIZE + 1) + QR6_K*k0;
+ const int index_y = j * WARP_SIZE + (QR6_K*k0) % WARP_SIZE;
+ sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q6_K_q8_1_impl_mmq(
+ &x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]);
+ }
+ }
+}
+
+// -------------------------------------------------------------------------------------------------------------------------------------
+
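+// mmq_type_traits maps each quantized ggml type to its tile loader, its tile dot product
+// function, and the compile-time constants (need_sum, vdr) used by mul_mat_q.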
+template <int mmq_x, int mmq_y, int nwarps, bool need_check, ggml_type type>
+struct mmq_type_traits;
+
+template <int mmq_x, int mmq_y, int nwarps, bool need_check>
+struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q4_0> {
+ static constexpr bool need_sum = true;
+ static constexpr int vdr = VDR_Q4_0_Q8_1_MMQ;
+ static constexpr load_tiles_mmq_t load_tiles = load_tiles_q4_0<mmq_y, nwarps, need_check>;
+ static constexpr vec_dot_mmq_t vec_dot = vec_dot_q4_0_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
+};
+
+template <int mmq_x, int mmq_y, int nwarps, bool need_check>
+struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q4_1> {
+ static constexpr bool need_sum = true;
+ static constexpr int vdr = VDR_Q4_1_Q8_1_MMQ;
+ static constexpr load_tiles_mmq_t load_tiles = load_tiles_q4_1<mmq_y, nwarps, need_check>;
+ static constexpr vec_dot_mmq_t vec_dot = vec_dot_q4_1_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
+};
+
+template <int mmq_x, int mmq_y, int nwarps, bool need_check>
+struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q5_0> {
+ static constexpr bool need_sum = false;
+ static constexpr int vdr = VDR_Q5_0_Q8_1_MMQ;
+ static constexpr load_tiles_mmq_t load_tiles = load_tiles_q5_0<mmq_y, nwarps, need_check>;
+ static constexpr vec_dot_mmq_t vec_dot = vec_dot_q5_0_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
+};
+
+template <int mmq_x, int mmq_y, int nwarps, bool need_check>
+struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q5_1> {
+ static constexpr bool need_sum = true;
+ static constexpr int vdr = VDR_Q5_1_Q8_1_MMQ;
+ static constexpr load_tiles_mmq_t load_tiles = load_tiles_q5_1<mmq_y, nwarps, need_check>;
+ static constexpr vec_dot_mmq_t vec_dot = vec_dot_q5_1_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
+};
+
+template <int mmq_x, int mmq_y, int nwarps, bool need_check>
+struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q8_0> {
+ static constexpr bool need_sum = false;
+ static constexpr int vdr = VDR_Q8_0_Q8_1_MMQ;
+ static constexpr load_tiles_mmq_t load_tiles = load_tiles_q8_0<mmq_y, nwarps, need_check>;
+ static constexpr vec_dot_mmq_t vec_dot = vec_dot_q8_0_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
+};
+
+template <int mmq_x, int mmq_y, int nwarps, bool need_check>
+struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q2_K> {
+ static constexpr bool need_sum = false;
+ static constexpr int vdr = VDR_Q2_K_Q8_1_MMQ;
+ static constexpr load_tiles_mmq_t load_tiles = load_tiles_q2_K<mmq_y, nwarps, need_check>;
+ static constexpr vec_dot_mmq_t vec_dot = vec_dot_q2_K_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
+};
+
+template <int mmq_x, int mmq_y, int nwarps, bool need_check>
+struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q3_K> {
+ static constexpr bool need_sum = false;
+ static constexpr int vdr = VDR_Q3_K_Q8_1_MMQ;
+ static constexpr load_tiles_mmq_t load_tiles = load_tiles_q3_K<mmq_y, nwarps, need_check>;
+ static constexpr vec_dot_mmq_t vec_dot = vec_dot_q3_K_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
+};
+
+template <int mmq_x, int mmq_y, int nwarps, bool need_check>
+struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q4_K> {
+ static constexpr bool need_sum = true;
+ static constexpr int vdr = VDR_Q4_K_Q8_1_MMQ;
+ static constexpr load_tiles_mmq_t load_tiles = load_tiles_q4_K<mmq_y, nwarps, need_check>;
+ static constexpr vec_dot_mmq_t vec_dot = vec_dot_q4_K_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
+};
+
+template <int mmq_x, int mmq_y, int nwarps, bool need_check>
+struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q5_K> {
+ static constexpr bool need_sum = true;
+ static constexpr int vdr = VDR_Q5_K_Q8_1_MMQ;
+ static constexpr load_tiles_mmq_t load_tiles = load_tiles_q5_K<mmq_y, nwarps, need_check>;
+ static constexpr vec_dot_mmq_t vec_dot = vec_dot_q5_K_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
+};
+
+template <int mmq_x, int mmq_y, int nwarps, bool need_check>
+struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q6_K> {
+ static constexpr bool need_sum = false;
+ static constexpr int vdr = VDR_Q6_K_Q8_1_MMQ;
+ static constexpr load_tiles_mmq_t load_tiles = load_tiles_q6_K<mmq_y, nwarps, need_check>;
+ static constexpr vec_dot_mmq_t vec_dot = vec_dot_q6_K_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
+};
+
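+// __launch_bounds__ requests a minimum number of resident blocks per SM (1 or 2 depending on
+// the architecture), which caps the per-thread register usage the compiler may assume.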
+template <ggml_type type, int mmq_x, int nwarps, bool need_check>
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+ __launch_bounds__(WARP_SIZE*nwarps, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#else
+#if __CUDA_ARCH__ >= CC_VOLTA
+ __launch_bounds__(WARP_SIZE*nwarps, 1)
+#else
+ __launch_bounds__(WARP_SIZE*nwarps, type == GGML_TYPE_Q2_K ? 1 : 2)
+#endif // __CUDA_ARCH__ >= CC_VOLTA
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+static __global__ void mul_mat_q(
+ const char * __restrict__ x, const char * __restrict__ yc, float * __restrict__ dst,
+ const int ne00, const int ne01, const int stride00, const int ne10, const int ne11, const int ne0) {
+
+ // Skip unused template specializations for faster compilation:
+ if (mmq_x > get_mmq_x_max_device()) {
+ NO_DEVICE_CODE;
+ return;
+ }
+
+ constexpr int qk = ggml_cuda_type_traits<type>::qk;
+ constexpr int qr = ggml_cuda_type_traits<type>::qr;
+ constexpr int qi = ggml_cuda_type_traits<type>::qi;
+ constexpr int mmq_y = get_mmq_y_device(mmq_x);
+ constexpr bool need_sum = mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, type>::need_sum;
+ constexpr int vdr = mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, type>::vdr;
+ constexpr load_tiles_mmq_t load_tiles = mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, type>::load_tiles;
+ constexpr vec_dot_mmq_t vec_dot = mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, type>::vec_dot;
+
+ constexpr tile_x_sizes txs = get_tile_x_sizes_device<mmq_y>(type);
+
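+ // partition the dynamic shared memory: x tile first (quants, scales, high bits, sub-block scales),
+ // followed by the y tile (q8_1 quants and their scales/sums)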
+ extern __shared__ char data_mul_mat_q[];
+ int * tile_x_ql = (int *) data_mul_mat_q;
+ half2 * tile_x_dm = (half2 *) (tile_x_ql + txs.ql);
+ int * tile_x_qh = (int *) (tile_x_dm + txs.dm);
+ int * tile_x_sc = (int *) (tile_x_qh + txs.qh);
+ int * tile_y_qs = (int *) (tile_x_sc + txs.sc); // [mmq_x * WARP_SIZE]
+ half2 * tile_y_ds = (half2 *) (tile_y_qs + mmq_x*WARP_SIZE); // [mmq_x * WARP_SIZE/QI8_1];
+
+ const block_q8_1 * y = (const block_q8_1 *) yc;
+
+ const int blocks_per_row_x = ne00 / qk;
+ const int blocks_per_col_y = ne10 / QK8_1;
+ const int blocks_per_warp = WARP_SIZE / qi;
+
+ const int & ne1 = ne11;
+
+ const int tile_x_max_i = ne01 - blockIdx.x*mmq_y - 1;
+
+ float sum[(mmq_x/nwarps) * (mmq_y/WARP_SIZE)] = {0.0f};
+
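+ // for each group of x blocks along ne00: load one x tile into shared memory,
+ // then stream the matching y data in qr slices and accumulate into the per-thread sum array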
+ for (int kb0 = 0; kb0 < blocks_per_row_x; kb0 += blocks_per_warp) {
+
+ load_tiles(x, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, stride00*blockIdx.x*mmq_y + kb0, tile_x_max_i, stride00);
+
+#pragma unroll
+ for (int kr = 0; kr < qr; ++kr) {
+ const int kqs = kr*WARP_SIZE + threadIdx.x;
+ const int kbxd = kqs / QI8_1;
+
+#pragma unroll
+ for (int i0 = 0; i0 < mmq_x; i0 += nwarps) {
+ const int i = min(blockIdx.y*mmq_x + threadIdx.y + i0, ne11-1); // to prevent out-of-bounds memory accesses
+
+ const block_q8_1 * by0 = &y[i*blocks_per_col_y + kb0 * (qk/QK8_1) + kbxd];
+
+ const int index_y = (i0 + threadIdx.y) * WARP_SIZE + kqs % WARP_SIZE;
+ tile_y_qs[index_y] = get_int_from_int8_aligned(by0->qs, threadIdx.x % QI8_1);
+ }
+
+#pragma unroll
+ for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) {
+ const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE/QI8_1)) % mmq_x;
+ const int kby = threadIdx.x % (WARP_SIZE/QI8_1);
+ const int i_y_eff = min(blockIdx.y*mmq_x + ids, ne11-1);
+
+ // if the sum is not needed it's faster to transform the scale to f32 ahead of time
+ const half2 * dsi_src = &y[i_y_eff*blocks_per_col_y + kb0 * (qk/QK8_1) + kr*(WARP_SIZE/QI8_1) + kby].ds;
+ half2 * dsi_dst = &tile_y_ds[ids * (WARP_SIZE/QI8_1) + kby];
+ if (need_sum) {
+ *dsi_dst = *dsi_src;
+ } else {
+ float * dfi_dst = (float *) dsi_dst;
+ *dfi_dst = __low2float(*dsi_src);
+ }
+ }
+
+ __syncthreads();
+
+// #pragma unroll // unrolling this loop causes too much register pressure
+ for (int k0 = kr*WARP_SIZE/qr; k0 < (kr+1)*WARP_SIZE/qr; k0 += vdr) {
+ vec_dot(tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds, sum, k0);
+ }
+
+ __syncthreads();
+ }
+ }
+
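+ // write the accumulated sums to dst, guarding against indices past ne1/ne0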
+#pragma unroll
+ for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
+ const int j = blockIdx.y*mmq_x + j0 + threadIdx.y;
+
+ if (j >= ne1) {
+ return;
+ }
+
+#pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
+ const int i = blockIdx.x*mmq_y + i0 + threadIdx.x;
+
+ if (need_check && i >= ne0) {
+ continue;
+ }
+
+ dst[j*ne0 + i] = sum[(j0/nwarps) * (mmq_y/WARP_SIZE) + i0/WARP_SIZE];
+ }
+ }
+}
+
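+// arguments for a mul_mat_q launch: data pointers plus the tensor dimensions/strides involved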
+struct mmq_args {
+ const char * x; const char * y; float * dst;
+ int64_t ne00; int64_t ne01; int64_t stride00;
+ int64_t ne10; int64_t ne11;
+ int64_t ne0;
+};
+
+template <ggml_type type, int mmq_x, int nwarps>
+static void launch_mul_mat_q(const mmq_args & args, cudaStream_t stream) {
+ const int id = ggml_cuda_get_device();
+ const int cc = ggml_cuda_info().devices[id].cc;
+ const int mmq_y = get_mmq_y_host(cc, mmq_x);
+
+ const int block_num_x = (args.ne01 + mmq_y - 1) / mmq_y;
+ const int block_num_y = (args.ne11 + mmq_x - 1) / mmq_x;
+ const dim3 block_nums(block_num_x, block_num_y, 1);
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
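+ // dynamic shared memory: the x tile sizes depend on the ggml type, the y tile size on mmq_x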
+ const tile_x_sizes txs = get_tile_x_sizes_host(type, mmq_y);
+ const int shmem_x = txs.ql*sizeof(int) + txs.dm*sizeof(half2) + txs.qh*sizeof(int) + txs.sc*sizeof(int);
+ const int shmem_y = mmq_x*WARP_SIZE*sizeof(int) + mmq_x*(WARP_SIZE/QI8_1)*sizeof(half2);
+ const int shmem = shmem_x + shmem_y;
+
+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
+ static bool shmem_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
+ if (!shmem_limit_raised[id]) {
+ CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q<type, mmq_x, nwarps, false>, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem));
+ CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q<type, mmq_x, nwarps, true>, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem));
+ shmem_limit_raised[id] = true;
+ }
+#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
+
+ if (args.ne01 % mmq_y == 0) {
+ const bool need_check = false;
+ mul_mat_q<type, mmq_x, nwarps, need_check><<<block_nums, block_dims, shmem, stream>>>
+ (args.x, args.y, args.dst, args.ne00, args.ne01, args.stride00, args.ne10, args.ne11, args.ne0);
+ } else {
+ const bool need_check = true;
+ mul_mat_q<type, mmq_x, nwarps, need_check><<<block_nums, block_dims, shmem, stream>>>
+ (args.x, args.y, args.dst, args.ne00, args.ne01, args.stride00, args.ne10, args.ne11, args.ne0);
+ }
+}
+
+template <ggml_type type>
+void mul_mat_q_case(const mmq_args & args, cudaStream_t stream) {
+ const int id = ggml_cuda_get_device();
+ const int nsm = ggml_cuda_info().devices[id].nsm;
+ const int cc = ggml_cuda_info().devices[id].cc;
+
+ const int mmq_x_max = get_mmq_x_max_host(cc);
+ const int mmq_y = get_mmq_y_host(cc, mmq_x_max);
+ const int block_num_y = (args.ne01 + mmq_y - 1) / mmq_y;
+
+ int mmq_x_best = 0;
+ int nwaves_best = INT_MAX;
+
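+ // pick the mmq_x tile width that needs the fewest waves of thread blocks across the SMs for this ne11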
+ for (int mmq_x = 8; mmq_x <= mmq_x_max && nwaves_best > 1; mmq_x += 8) {
+ const int block_num_x = (args.ne11 + mmq_x - 1) / mmq_x;
+ const int nwaves = (block_num_x*block_num_y + nsm - 1) / nsm;
+
+ if (nwaves < nwaves_best) {
+ mmq_x_best = mmq_x;
+ nwaves_best = nwaves;
+ }
+ }
+
+ switch (mmq_x_best) {
+ case 8:
+ launch_mul_mat_q<type, 8, 4>(args, stream);
+ break;
+ case 16:
+ launch_mul_mat_q<type, 16, 8>(args, stream);
+ break;
+ case 24:
+ launch_mul_mat_q<type, 24, 8>(args, stream);
+ break;
+ case 32:
+ launch_mul_mat_q<type, 32, 8>(args, stream);
+ break;
+ case 40:
+ launch_mul_mat_q<type, 40, 8>(args, stream);
+ break;
+ case 48:
+ launch_mul_mat_q<type, 48, 8>(args, stream);
+ break;
+ case 56:
+ launch_mul_mat_q<type, 56, 8>(args, stream);
+ break;
+ case 64:
+ launch_mul_mat_q<type, 64, 8>(args, stream);
+ break;
+ case 72:
+ launch_mul_mat_q<type, 72, 8>(args, stream);
+ break;
+ case 80:
+ launch_mul_mat_q<type, 80, 8>(args, stream);
+ break;
+ case 88:
+ launch_mul_mat_q<type, 88, 8>(args, stream);
+ break;
+ case 96:
+ launch_mul_mat_q