Merge branch 'gg/flash-attn' of https://github.com/ggerganov/llama.cpp into flash-attn-cuda

Author: FSSRepo
Date:   2024-01-23 13:51:59 -05:00
Commit: a689b02ad3

20 changed files with 1437 additions and 512 deletions

ggml-cuda.cu

@@ -12,9 +12,6 @@
#include <vector>
#include <map>
#include <array>
#include "ggml-cuda.h"
#include "ggml.h"
#include "ggml-backend-impl.h"
#if defined(GGML_USE_HIPBLAS)
#include <hip/hip_runtime.h>
@@ -118,6 +115,11 @@
#endif // defined(GGML_USE_HIPBLAS)
+// ggml-cuda needs the half type, so keep the ggml header includes last
+#include "ggml-cuda.h"
+#include "ggml.h"
+#include "ggml-backend-impl.h"
#define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
#define CC_PASCAL 600
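
For reference, a version gate like CUDART_HMAX is typically consumed by comparing it against CUDART_VERSION, so that the __hmax intrinsic is only used on toolkits where it is known to work, with older toolkits falling back to a plain float comparison. The sketch below is illustrative only: the helper name hmax_compat and the float fallback are assumptions, not code taken from this diff.

#include <cuda_runtime.h> // provides CUDART_VERSION
#include <cuda_fp16.h>    // provides half, __hmax, __half2float

#define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax is known to work

// Hypothetical helper: use the __hmax intrinsic when the toolkit is new enough,
// otherwise compare the operands after converting them to float.
static __device__ __forceinline__ half hmax_compat(const half a, const half b) {
#if defined(CUDART_VERSION) && CUDART_VERSION >= CUDART_HMAX
    return __hmax(a, b);
#else
    return __half2float(a) > __half2float(b) ? a : b;
#endif
}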