CUDA: generalize FP16 fattn vec kernel (#7061)

* CUDA: generalize FP16 fattn vec kernel

* disable unsupported head sizes for AMD in test

* try AMD fix

* fix batch size 2-8

* partially revert changes
This commit is contained in:
Johannes Gäßler 2024-05-09 14:32:02 +02:00 committed by GitHub
parent f31ec120bc
commit a743d76a01
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 374 additions and 220 deletions

View file

@@ -2175,7 +2175,11 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
test_cases.emplace_back(new test_timestep_embedding());
test_cases.emplace_back(new test_leaky_relu());
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
for (int hs : { 64, 128, }) { // other head sizes not implemented
#else
for (int hs : { 64, 80, 128, 256, }) {
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
for (int nh : { 32, }) {
for (int kv : { 512, 1024, }) {
for (int nb : { 1, 2, 4, 8, }) {