Merge branch 'gg/flash-attn' of https://github.com/ggerganov/llama.cpp into flash-attn-cuda

Author: FSSRepo
Date:   2024-01-23 13:51:59 -05:00
Commit: a689b02ad3

20 changed files with 1437 additions and 512 deletions

ggml-cuda.cu

@@ -12,9 +12,6 @@
#include <vector>
#include <map>
#include <array>
#include "ggml-cuda.h"
#include "ggml.h"
#include "ggml-backend-impl.h"
#if defined(GGML_USE_HIPBLAS)
#include <hip/hip_runtime.h>
@@ -118,6 +115,11 @@
#endif // defined(GGML_USE_HIPBLAS)
+// ggml-cuda needs the half type, so keep the ggml header includes last
+#include "ggml-cuda.h"
+#include "ggml.h"
+#include "ggml-backend-impl.h"
#define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
#define CC_PASCAL 600
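
For reference, a version gate like CUDART_HMAX is typically consumed by comparing it against CUDART_VERSION, so that the __hmax intrinsic is only used on toolkits where it is known to work, with older toolkits falling back to a plain float comparison. The sketch below is illustrative only: the helper name hmax_compat and the float fallback are assumptions, not code taken from this diff.

#include <cuda_runtime.h> // provides CUDART_VERSION
#include <cuda_fp16.h>    // provides half, __hmax, __half2float

#define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax is known to work

// Hypothetical helper: use the __hmax intrinsic when the toolkit is new enough,
// otherwise compare the operands after converting them to float.
static __device__ __forceinline__ half hmax_compat(const half a, const half b) {
#if defined(CUDART_VERSION) && CUDART_VERSION >= CUDART_HMAX
    return __hmax(a, b);
#else
    return __half2float(a) > __half2float(b) ? a : b;
#endif
}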