CUDA: refactor host code, dyn. par. blocks

2024-04-09 11:39:16 +02:00 · 2024-04-09 11:39:16 +02:00 · 34f93bbb39
commit 34f93bbb39
parent 5668c79ea0
3 changed files with 258 additions and 311 deletions
--- a/ggml-cuda/common.cuh
+++ b/ggml-cuda/common.cuh
@@ -390,6 +390,11 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
 }
 #endif // defined(GGML_USE_HIPBLAS)

+#define FP16_AVAILABLE     (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) ? \
+    defined(RDNA1) || defined(RDNA2) || defined(RDNA3) : __CUDA_ARCH__ >= CC_PASCAL) // #if-only; parenthesized so it composes with !, &&, ||
+#define FP16_MMA_AVAILABLE (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) ? \
+                                        defined(RDNA3) : __CUDA_ARCH__ >= CC_VOLTA) // #if-only; parenthesized so it composes with !, &&, ||
+
 // TODO: move to ggml-common.h
 static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};

@@ -403,6 +408,7 @@ struct ggml_cuda_device_info {

    struct cuda_device_info {
        int     cc;                 // compute capability
+        int     nsm;                // number of streaming multiprocessors
        size_t  smpb;               // max. shared memory per block
        bool    vmm;                // virtual memory support
        size_t  vmm_granularity;    // granularity of virtual memory