|
@@ -71,6 +71,47 @@
|
|
|
#define GGML_CUDA_CC_QY1 210
|
|
#define GGML_CUDA_CC_QY1 210
|
|
|
#define GGML_CUDA_CC_QY2 220
|
|
#define GGML_CUDA_CC_QY2 220
|
|
|
|
|
|
|
|
|
|
+#ifdef __CUDA_ARCH_LIST__
|
|
|
|
|
+constexpr bool ggml_cuda_has_arch_impl(int) {
|
|
|
|
|
+ return false;
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+template<class ... Archs>
|
|
|
|
|
+constexpr bool ggml_cuda_has_arch_impl(const int arch, const int first, Archs... rest) {
|
|
|
|
|
+ return arch == first || ggml_cuda_has_arch_impl(arch, rest...);
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+constexpr bool ggml_cuda_has_arch(const int arch) {
|
|
|
|
|
+ return ggml_cuda_has_arch_impl(arch, __CUDA_ARCH_LIST__);
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+constexpr int ggml_cuda_highest_compiled_arch_impl(const int arch, const int cur) {
|
|
|
|
|
+ if (cur == 0) {
|
|
|
|
|
+ GGML_ABORT("ggml was not compiled with any CUDA arch <= %d", arch);
|
|
|
|
|
+ }
|
|
|
|
|
+ return cur;
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+template<class ... Archs>
|
|
|
|
|
+constexpr int ggml_cuda_highest_compiled_arch_impl(const int arch, const int cur, const int first, Archs... rest) {
|
|
|
|
|
+ if (first <= arch && first > cur) {
|
|
|
|
|
+ return ggml_cuda_highest_compiled_arch_impl(arch, first, rest...);
|
|
|
|
|
+ } else {
|
|
|
|
|
+ return ggml_cuda_highest_compiled_arch_impl(arch, cur, rest...);
|
|
|
|
|
+ }
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+constexpr int ggml_cuda_highest_compiled_arch(const int arch) {
|
|
|
|
|
+ return ggml_cuda_highest_compiled_arch_impl(arch, 0, __CUDA_ARCH_LIST__);
|
|
|
|
|
+}
|
|
|
|
|
+#else
|
|
|
|
|
+static int ggml_cuda_highest_compiled_arch(const int arch) {
|
|
|
|
|
+ return arch;
|
|
|
|
|
+}
|
|
|
|
|
+#endif // __CUDA_ARCH_LIST__
|
|
|
|
|
+
|
|
|
|
|
+// ---------------------------------------------------------------------------------------------------------
|
|
|
|
|
+
|
|
|
#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
|
|
#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
|
|
|
|
|
|
|
|
#if defined(_MSC_VER)
|
|
#if defined(_MSC_VER)
|
|
@@ -162,18 +203,32 @@ typedef float2 dfloat2;
|
|
|
#define FLASH_ATTN_AVAILABLE
|
|
#define FLASH_ATTN_AVAILABLE
|
|
|
#endif // !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)
|
|
#endif // !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)
|
|
|
|
|
|
|
|
-static constexpr bool fast_fp16_available(const int cc) {
|
|
|
|
|
|
|
+static bool fp16_available(const int cc) {
|
|
|
|
|
+ return ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_PASCAL;
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+static bool fast_fp16_available(const int cc) {
|
|
|
|
|
+ return fp16_available(cc) && cc != 610;
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+// To be used for feature selection of external libraries, e.g. cuBLAS.
|
|
|
|
|
+static bool fast_fp16_hardware_available(const int cc) {
|
|
|
return cc >= GGML_CUDA_CC_PASCAL && cc != 610;
|
|
return cc >= GGML_CUDA_CC_PASCAL && cc != 610;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-// Any FP16 tensor cores are available.
|
|
|
|
|
-static constexpr bool fp16_mma_available(const int cc) {
|
|
|
|
|
|
|
+// Any FP16 tensor core instructions are available for ggml code.
|
|
|
|
|
+static bool fp16_mma_available(const int cc) {
|
|
|
|
|
+ return cc < GGML_CUDA_CC_OFFSET_AMD && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA;
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+// To be used for feature selection of external libraries, e.g. cuBLAS.
|
|
|
|
|
+static bool fp16_mma_hardware_available(const int cc) {
|
|
|
return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_VOLTA;
|
|
return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_VOLTA;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
// Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
|
|
// Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
|
|
|
-static constexpr bool new_mma_available(const int cc) {
|
|
|
|
|
- return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_TURING;
|
|
|
|
|
|
|
+static bool new_mma_available(const int cc) {
|
|
|
|
|
+ return cc < GGML_CUDA_CC_OFFSET_AMD && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
static constexpr __device__ int ggml_cuda_get_physical_warp_size() {
|
|
static constexpr __device__ int ggml_cuda_get_physical_warp_size() {
|