Merge branch 'gg/flash-attn' of https://github.com/ggerganov/llama.cpp into flash-attn-cuda
commit a689b02ad3
20 changed files with 1437 additions and 512 deletions
@@ -12,9 +12,6 @@
 #include <vector>
 #include <map>
 #include <array>
-#include "ggml-cuda.h"
-#include "ggml.h"
-#include "ggml-backend-impl.h"
 
 #if defined(GGML_USE_HIPBLAS)
 #include <hip/hip_runtime.h>
@@ -118,6 +115,11 @@
 
 #endif // defined(GGML_USE_HIPBLAS)
 
+// ggml-cuda need half type so keep ggml headers include at last
+#include "ggml-cuda.h"
+#include "ggml.h"
+#include "ggml-backend-impl.h"
+
 #define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
 
 #define CC_PASCAL 600
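Read together, the two hunks move the ggml headers from the top of the file to below the HIP block: under GGML_USE_HIPBLAS the HIP headers supply the half type that the ggml CUDA code depends on, so they must be processed first. A sketch of the resulting top-of-file layout, with the CUDA-to-HIP compatibility block that sits between the two hunks (roughly 100 lines, judging by the hunk offsets) elided:

// Sketch of the include order after this change (illustrative, not the
// full file): vendor runtime headers first, ggml headers last, so the
// half type already exists when the ggml headers are processed.
#include <vector>
#include <map>
#include <array>

#if defined(GGML_USE_HIPBLAS)
#include <hip/hip_runtime.h>
// ... CUDA-to-HIP compatibility defines (elided) ...
#endif // defined(GGML_USE_HIPBLAS)

// ggml-cuda need half type so keep ggml headers include at last
#include "ggml-cuda.h"
#include "ggml.h"
#include "ggml-backend-impl.h"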
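The new CUDART_HMAX constant records the minimum CUDA runtime version (11.7) for which the __hmax and __hmax2 intrinsics are known to work. A minimal sketch of how such a guard is typically applied, with a float round-trip as the fallback on older toolkits; hmax2_compat is a hypothetical name introduced for illustration, not code from this diff:

// Minimal sketch, assuming a CUDART_HMAX-style version gate:
// use the native intrinsic on CUDA >= 11.7, fall back otherwise.
#include <cuda_fp16.h>
#include <cuda_runtime.h> // CUDART_VERSION

#define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work

// Hypothetical helper: elementwise max of two half2 values.
static __device__ __forceinline__ half2 hmax2_compat(const half2 a, const half2 b) {
#if CUDART_VERSION >= CUDART_HMAX
    return __hmax2(a, b); // native half2 max intrinsic
#else
    // Older toolkits: widen both lanes to float, take the max, repack.
    const float2 fa = __half22float2(a);
    const float2 fb = __half22float2(b);
    return __floats2half2_rn(fmaxf(fa.x, fb.x), fmaxf(fa.y, fb.y));
#endif
}

The float fallback trades a few conversion instructions for portability; the comment in the diff notes 11.7 "may be higher than needed", so the gate is deliberately conservative.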