@@ -41,14 +41,17 @@
 #define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
 #define CUDART_HMASK 12000 // CUDA 12.0, min. ver. for half2 -> uint mask comparisons

-#define GGML_CUDA_CC_PASCAL 600
-#define GGML_CUDA_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
-#define GGML_CUDA_CC_VOLTA 700
-#define GGML_CUDA_CC_TURING 750
-#define GGML_CUDA_CC_AMPERE 800
-#define GGML_CUDA_CC_ADA_LOVELACE 890
-#define GGML_CUDA_CC_OFFSET_AMD 0x1000000
-
+#define GGML_CUDA_CC_PASCAL 600
+#define GGML_CUDA_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+#define GGML_CUDA_CC_VOLTA 700
+#define GGML_CUDA_CC_TURING 750
+#define GGML_CUDA_CC_AMPERE 800
+#define GGML_CUDA_CC_ADA_LOVELACE 890
+#define GGML_CUDA_CC_OFFSET_AMD 0x1000000
+#define GGML_CUDA_CC_OFFSET_MTHREADS 0x0100000
+#define GGML_CUDA_CC_IS_NVIDIA(cc) (cc < GGML_CUDA_CC_OFFSET_MTHREADS)
+
+// AMD
 // GCN/CDNA, wave size is 64
 #define GGML_CUDA_CC_GCN4 (GGML_CUDA_CC_OFFSET_AMD + 0x803) // Tonga, Fiji, Polaris, minimum for fast fp16
 #define GGML_CUDA_CC_VEGA (GGML_CUDA_CC_OFFSET_AMD + 0x900) // Vega56/64, minimum for fp16 dual issue
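Note (illustration, not part of the patch): with the new offset, a single integer now encodes both vendor and architecture: plain NVIDIA compute capabilities stay below 0x0100000, Moore Threads values occupy [0x0100000, 0x1000000), and AMD values start at 0x1000000. A minimal sketch of a vendor classifier built only from the macros above; the helper name cc_vendor_name is hypothetical:

    // Sketch only: classify an offset-encoded compute capability by vendor range.
    static const char * cc_vendor_name(const int cc) {
        if (cc < GGML_CUDA_CC_OFFSET_MTHREADS) return "NVIDIA";        // e.g. 750 (Turing), 890 (Ada Lovelace)
        if (cc < GGML_CUDA_CC_OFFSET_AMD)      return "Moore Threads"; // e.g. 0x0100000 + 0x210 (MTT S80)
        return "AMD";                                                  // e.g. 0x1000000 + 0x803 (GCN4)
    }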
@@ -70,8 +73,17 @@
 #define GGML_CUDA_CC_IS_GCN(cc) (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA)
 #define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA && cc < GGML_CUDA_CC_RDNA1)

-#define GGML_CUDA_CC_QY1 210
-#define GGML_CUDA_CC_QY2 220
+// Moore Threads
+#define GGML_CUDA_MUSA_ARCH_IS_QY1 (__MUSA_ARCH__ <= 210)
+
+#define GGML_CUDA_CC_QY1 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000
+#define GGML_CUDA_CC_QY2 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000
+#define GGML_CUDA_CC_NG (GGML_CUDA_CC_OFFSET_MTHREADS + 0x310) // TBD
+
+#define GGML_CUDA_CC_IS_MTHREADS(cc) (cc >= GGML_CUDA_CC_OFFSET_MTHREADS && cc < GGML_CUDA_CC_OFFSET_AMD)
+#define GGML_CUDA_CC_IS_QY1(cc) (cc >= GGML_CUDA_CC_QY1 && cc < GGML_CUDA_CC_QY2)
+#define GGML_CUDA_CC_IS_QY2(cc) (cc >= GGML_CUDA_CC_QY2 && cc < GGML_CUDA_CC_NG)
+#define GGML_CUDA_CC_IS_NG(cc) (cc >= GGML_CUDA_CC_NG)

 #ifdef __CUDA_ARCH_LIST__
 constexpr bool ggml_cuda_has_arch_impl(int) {
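Note (illustration, not part of the patch): under the definitions above, QY1, QY2 and NG resolve to 0x100210, 0x100220 and 0x100310, and the IS_* predicates split the Moore Threads range into half-open tiers. A few compile-time checks that should hold, assuming the macros are taken exactly as defined in this hunk:

    // Sketch only: sanity checks for the offset-encoded Moore Threads tiers.
    static_assert(GGML_CUDA_CC_QY1 == 0x100210, "QY1 = MTHREADS offset + 0x210");
    static_assert(GGML_CUDA_CC_IS_MTHREADS(GGML_CUDA_CC_QY1), "QY1 lies inside the MTHREADS range");
    static_assert(GGML_CUDA_CC_IS_QY1(GGML_CUDA_CC_QY1) && !GGML_CUDA_CC_IS_QY1(GGML_CUDA_CC_QY2), "tiers are half-open");
    static_assert(GGML_CUDA_CC_IS_NG(GGML_CUDA_CC_NG) && !GGML_CUDA_CC_IS_QY2(GGML_CUDA_CC_NG), "NG starts where QY2 ends");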
@@ -209,21 +221,21 @@ typedef float2 dfloat2;
 #define CP_ASYNC_AVAILABLE
 #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE

-#if !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)
+#if !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && GGML_CUDA_MUSA_ARCH_IS_QY1)
 #define FLASH_ATTN_AVAILABLE
-#endif // !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)
+#endif // !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && GGML_CUDA_MUSA_ARCH_IS_QY1)

 static bool fp16_available(const int cc) {
     return ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_PASCAL;
 }

 static bool fast_fp16_available(const int cc) {
-    return fp16_available(cc) && cc != 610;
+    return (GGML_CUDA_CC_IS_NVIDIA(cc) && fp16_available(cc) && cc != 610) || GGML_CUDA_CC_IS_AMD(cc);
 }

 // To be used for feature selection of external libraries, e.g. cuBLAS.
 static bool fast_fp16_hardware_available(const int cc) {
-    return cc >= GGML_CUDA_CC_PASCAL && cc != 610;
+    return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_PASCAL && cc != 610) || GGML_CUDA_CC_IS_AMD(cc);
 }

 // Any FP16 tensor core instructions are available for ggml code.
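Note (illustration, not part of the patch): the rewritten checks keep the NVIDIA-specific exclusion of compute capability 6.1 (cc 610, which has very low FP16 throughput) while routing every AMD device through its own branch; GGML_CUDA_CC_IS_AMD is assumed to be defined elsewhere in this header as (cc >= GGML_CUDA_CC_OFFSET_AMD). Expected behaviour, sketched:

    // Sketch only: expected results of the vendor-aware fp16 checks above.
    static void fast_fp16_examples() {
        const bool p610  = fast_fp16_hardware_available(610);                 // false: cc 6.1 (GP10x) stays excluded
        const bool volta = fast_fp16_hardware_available(GGML_CUDA_CC_VOLTA);  // true: NVIDIA, >= Pascal and != 610
        const bool vega  = fast_fp16_hardware_available(GGML_CUDA_CC_VEGA);   // true: AMD branch, independent of the 610 rule
        (void) p610; (void) volta; (void) vega;
    }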
@@ -231,20 +243,20 @@ static bool fp16_mma_available(const int cc) {
 #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN)
     return false;
 #else
-    return cc < GGML_CUDA_CC_OFFSET_AMD && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA ||
-        GGML_CUDA_CC_IS_CDNA(cc) || cc >= GGML_CUDA_CC_RDNA3;
+    return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA ||
+        GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc);
 #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN)
 }

 // To be used for feature selection of external libraries, e.g. cuBLAS.
 static bool fp16_mma_hardware_available(const int cc) {
-    return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_VOLTA ||
-        GGML_CUDA_CC_IS_CDNA(cc) || cc >= GGML_CUDA_CC_RDNA3;
+    return GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA ||
+        GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc);
 }

 // Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
 static bool new_mma_available(const int cc) {
-    return cc < GGML_CUDA_CC_OFFSET_AMD && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING;
+    return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING;
 }

 static bool cp_async_available(const int cc) {