cuda : add CUDA_USE_TENSOR_CORES and GGML_CUDA_FORCE_MMQ macros

2023-10-25 18:48:36 +03:00 · 2023-10-25 18:48:36 +03:00 · a4e15a36e4
commit a4e15a36e4
parent 4c6744b526
3 changed files with 104 additions and 22 deletions
--- a/llama.h
+++ b/llama.h
@@ -178,7 +178,7 @@ extern "C" {
        float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model

        // Keep the booleans together to avoid misalignment during copy-by-value.
-        bool mul_mat_q;  // if true, use experimental mul_mat_q kernels
+        bool mul_mat_q;  // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
        bool f16_kv;     // use fp16 for KV cache, fp32 otherwise
        bool logits_all; // the llama_eval() call computes all logits, not just the last one
        bool embedding;  // embedding mode only