CUDA: quantized KV support for FA vec

2024-05-21 19:38:25 +02:00 · 2024-05-21 19:38:25 +02:00 · 672244a88b
commit 672244a88b
parent 10b1e45876
11 changed files with 826 additions and 142 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -106,6 +106,7 @@ set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
                                             "llama: max. batch size for using peer access")
 option(LLAMA_CUDA_NO_PEER_COPY               "llama: do not use peer to peer copies"            OFF)
 option(LLAMA_CUDA_NO_VMM                     "llama: do not try to use CUDA VMM"                OFF)
+option(LLAMA_CUDA_FA_ALL_QUANTS              "llama: compile all quants for FlashAttention"     OFF)

 option(LLAMA_CURL                            "llama: use libcurl to download model from an URL" OFF)
 option(LLAMA_HIPBLAS                         "llama: use hipBLAS"                               OFF)
@@ -427,6 +428,9 @@ if (LLAMA_CUDA)
        if (LLAMA_CUDA_NO_PEER_COPY)
            add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
        endif()
+        if (LLAMA_CUDA_FA_ALL_QUANTS)
+            add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
+        endif()

        if (LLAMA_STATIC)
            if (WIN32)