CUDA: refactor host code, dyn. par. blocks
This commit is contained in:
parent
5668c79ea0
commit
34f93bbb39
3 changed files with 258 additions and 311 deletions
|
@ -390,6 +390,11 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
|
|||
}
|
||||
#endif // defined(GGML_USE_HIPBLAS)
|
||||
|
||||
#define FP16_AVAILABLE defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) ? \
|
||||
defined(RDNA1) || defined(RDNA2) || defined(RDNA3) : __CUDA_ARCH__ >= CC_PASCAL
|
||||
#define FP16_MMA_AVAILABLE defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) ? \
|
||||
defined(RDNA3) : __CUDA_ARCH__ >= CC_VOLTA
|
||||
|
||||
// TODO: move to ggml-common.h
|
||||
static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
|
||||
|
||||
|
@ -403,6 +408,7 @@ struct ggml_cuda_device_info {
|
|||
|
||||
struct cuda_device_info {
|
||||
int cc; // compute capability
|
||||
int nsm; // number of streaming multiprocessors
|
||||
size_t smpb; // max. shared memory per block
|
||||
bool vmm; // virtual memory support
|
||||
size_t vmm_granularity; // granularity of virtual memory
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue