hace 4 semanas · 54189c0d39
--- a/ggml/src/ggml-cuda/mma.cuh
+++ b/ggml/src/ggml-cuda/mma.cuh
@@ -78,27 +78,25 @@ namespace ggml_cuda_mma {
 
				         // MIRRORED == Each data value is held exactly once per thread subgroup.
			
 
				         DATA_LAYOUT_I_MAJOR           =  0, // Always used for Turing, Ampere, Ada Lovelace, consumer Blackwell, matrix A&B for RDNA4 and CDNA.
			
 
				         DATA_LAYOUT_J_MAJOR           = 10, // Matrix C for CDNA and RDNA4, int and float matrix C for RDNA3.
			
 
				-        DATA_LAYOUT_I_MAJOR_MIRRORED  = 20,
			
 
				+        DATA_LAYOUT_I_MAJOR_MIRRORED  = 20, // Volta, matrix A&B for RDNA3.
			
 
				         DATA_LAYOUT_J_MAJOR_MIRRORED  = 30,
			
 
				-        DATA_LAYOUT_I_MAJOR_DUAL      = 40, // Matrix A&B for RDNA3.
			
 
				     };
			
 
				     // Implemented mma combinations are:
			
 
				     //   - (I_MAJOR, I_MAJOR)          -> I_MAJOR
			
 
				     //   - (I_MAJOR, I_MAJOR_MIRRORED) -> I_MAJOR
			
 
				     //   - (I_MAJOR, J_MAJOR_MIRRORED) -> I_MAJOR
			
 
				 
			
 
				     // Tells whether dl is an I-major layout. After this change that means
			
 
				     // DATA_LAYOUT_I_MAJOR or DATA_LAYOUT_I_MAJOR_MIRRORED; the removed (-) lines
			
 
				     // additionally accepted DATA_LAYOUT_I_MAJOR_DUAL.
			
 
				-    constexpr bool is_i_major(const data_layout dl) {
			
 
				+    static constexpr bool is_i_major(const data_layout dl) {
			
 
				         return dl == DATA_LAYOUT_I_MAJOR ||
			
 
				-               dl == DATA_LAYOUT_I_MAJOR_MIRRORED ||
			
 
				-               dl == DATA_LAYOUT_I_MAJOR_DUAL;
			
 
				+               dl == DATA_LAYOUT_I_MAJOR_MIRRORED;
			
 
				     }
			
 
				 
			
 
				     // Picks the input data layout at preprocessing time. After this change it returns
			
 
				     // DATA_LAYOUT_I_MAJOR_MIRRORED when RDNA3 is defined or when __CUDA_ARCH__ equals
			
 
				     // GGML_CUDA_CC_VOLTA, and DATA_LAYOUT_I_MAJOR otherwise. The removed (-) lines
			
 
				     // returned DATA_LAYOUT_I_MAJOR_DUAL, and only when RDNA3 was defined.
			
 
				-    constexpr data_layout get_input_data_layout() {
			
 
				-#if defined(RDNA3)
			
 
				-        return DATA_LAYOUT_I_MAJOR_DUAL;
			
 
				+    static constexpr __device__ data_layout get_input_data_layout() {
			
 
				+#if defined(RDNA3) || __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
			
 
				+        return DATA_LAYOUT_I_MAJOR_MIRRORED;
			
 
				 #else
			
 
				         return DATA_LAYOUT_I_MAJOR;
			
 
				-#endif // defined(RDNA3)
			
 
				+#endif // defined(RDNA3) || __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
			
 
				     }
			
 
				 
			
 
				     template <int I_, int J_, typename T, data_layout ds_=DATA_LAYOUT_I_MAJOR>
			
@@ -462,23 +460,27 @@ namespace ggml_cuda_mma {
 
				         }
			
 
				     };
			
 
				 
			
 
				-    template <int I_, int J_>
			
 
				-    struct tile<I_, J_, half2, DATA_LAYOUT_I_MAJOR_MIRRORED> {
			
 
				+    template <int I_, int J_, typename T>
			
 
				+    struct tile<I_, J_, T, DATA_LAYOUT_I_MAJOR_MIRRORED> {
			
 
				         static constexpr int         I  = I_;
			
 
				         static constexpr int         J  = J_;
			
 
				         static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR_MIRRORED;
			
 
				-        static constexpr int         ne = I * J / (WARP_SIZE/4);
			
 
				 
			
 
				-        half2 x[ne] = {{0.0f, 0.0f}};
			
 
				+        // RDNA3
			
 
				+        static constexpr int         ne = I * J / 32 * 2;
			
 
				+
			
 
				+        T x[ne] = {0};
			
 
				 
			
 
				         static constexpr __device__ bool supported() {
			
 
				-            if (I ==  8 && J ==  4) return true;
			
 
				+            if (I == 16 && J == 16) return true;
			
 
				+            if (I == 16 && J == 8)  return true;
			
 
				+            if (I == 16 && J == 4)  return true;
			
 
				             return false;
			
 
				         }
			
 
				 
			
 
				         static __device__ __forceinline__ int get_i(const int /*l*/) {
			
 
				-            if constexpr (I == 8 && J == 4) {
			
 
				-                return ((threadIdx.x / 16) * 4) + (threadIdx.x % 4);
			
 
				+            if constexpr (supported()) {
			
 
				+                return threadIdx.x % 16;
			
 
				             } else {
			
 
				                 NO_DEVICE_CODE;
			
 
				                 return -1;
			
@@ -486,7 +488,7 @@ namespace ggml_cuda_mma {
 
				         }
			
 
				 
			
 
				         static __device__ __forceinline__ int get_j(const int l) {
			
 
				-            if constexpr (I == 8 && J == 4) {
			
 
				+            if constexpr (supported()) {
			
 
				                 return l;
			
 
				             } else {
			
 
				                 NO_DEVICE_CODE;
			
@@ -496,10 +498,27 @@ namespace ggml_cuda_mma {
 
				     };
			
 
				 
			
 
				     template <int I_, int J_>
			
 
				-    struct tile<I_, J_, half2, DATA_LAYOUT_J_MAJOR_MIRRORED> {
			
 
				+    struct tile<I_, J_, half2, DATA_LAYOUT_I_MAJOR_MIRRORED> {
			
 
				         static constexpr int         I  = I_;
			
 
				         static constexpr int         J  = J_;
			
 
				-        static constexpr data_layout dl = DATA_LAYOUT_J_MAJOR_MIRRORED;
			
 
				+        static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR_MIRRORED;
			
 
				+#if defined(RDNA3)
			
 
				+        static constexpr int         ne = tile<I_, J_, float, DATA_LAYOUT_I_MAJOR_MIRRORED>::ne;
			
 
				+
			
 
				+        half2 x[ne] = {{0.0f, 0.0f}};
			
 
				+
			
 
				+        static constexpr __device__ bool supported() {
			
 
				+            return tile<I_, J_, float, DATA_LAYOUT_I_MAJOR_MIRRORED>::supported();
			
 
				+        }
			
 
				+
			
 
				+        static __device__ __forceinline__ int get_i(const int l) {
			
 
				+            return tile<I_, J_, float, DATA_LAYOUT_I_MAJOR_MIRRORED>::get_i(l);
			
 
				+        }
			
 
				+
			
 
				+        static __device__ __forceinline__ int get_j(const int l) {
			
 
				+            return tile<I_, J_, float, DATA_LAYOUT_I_MAJOR_MIRRORED>::get_j(l);
			
 
				+        }
			
 
				+#else // Volta
			
 
				         static constexpr int         ne = I * J / (WARP_SIZE/4);
			
 
				 
			
 
				         half2 x[ne] = {{0.0f, 0.0f}};
			
@@ -509,9 +528,9 @@ namespace ggml_cuda_mma {
 
				             return false;
			
 
				         }
			
 
				 
			
 
				-        static __device__ __forceinline__ int get_i(const int l) {
			
 
				+        static __device__ __forceinline__ int get_i(const int /*l*/) {
			
 
				             if constexpr (I == 8 && J == 4) {
			
 
				-                return ((l / 2) * 4) + (threadIdx.x % 4);
			
 
				+                return ((threadIdx.x / 16) * 4) + (threadIdx.x % 4);
			
 
				             } else {
			
 
				                 NO_DEVICE_CODE;
			
 
				                 return -1;
			
@@ -520,34 +539,54 @@ namespace ggml_cuda_mma {
 
				 
			
 
				         static __device__ __forceinline__ int get_j(const int l) {
			
 
				             if constexpr (I == 8 && J == 4) {
			
 
				-                return ((threadIdx.x / 16) * 2) + (l % 2);
			
 
				+                return l;
			
 
				             } else {
			
 
				                 NO_DEVICE_CODE;
			
 
				                 return -1;
			
 
				             }
			
 
				         }
			
 
				+#endif // defined(RDNA3)
			
 
				     };
			
 
				 
			
 
				-    template <int I_, int J_, typename T>
			
 
				-    struct tile<I_, J_, T, DATA_LAYOUT_I_MAJOR_DUAL> {
			
 
				     // tile specialization for nv_bfloat162 with the DATA_LAYOUT_I_MAJOR_MIRRORED layout.
			
 
				     // ne, supported(), get_i() and get_j() all forward to the float specialization that
			
 
				     // has the same I_, J_ and layout. The interleaved removed (-) lines belong to the
			
 
				     // DATA_LAYOUT_I_MAJOR_DUAL specialization that this change deletes.
			
 
				+    template <int I_, int J_>
			
 
				+    struct tile<I_, J_, nv_bfloat162, DATA_LAYOUT_I_MAJOR_MIRRORED> {
			
 
				         static constexpr int         I  = I_;
			
 
				         static constexpr int         J  = J_;
			
 
				-        static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR_DUAL;
			
 
				+        static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR_MIRRORED;
			
 
				+        static constexpr int         ne = tile<I_, J_, float, DATA_LAYOUT_I_MAJOR_MIRRORED>::ne;
			
 
				 
			
 
				-        static constexpr int         ne = I * J / 32 * 2;
			
 
				+        nv_bfloat162 x[ne] = {{0.0f, 0.0f}};
			
 
				 
			
 
				-        T x[ne] = {0};
			
 
				         // The three members below delegate to the float tile of identical shape and layout.
			
 
				+        static constexpr __device__ bool supported() {
			
 
				+            return tile<I_, J_, float, DATA_LAYOUT_I_MAJOR_MIRRORED>::supported();
			
 
				+        }
			
 
				+
			
 
				+        static __device__ __forceinline__ int get_i(const int l) {
			
 
				+            return tile<I_, J_, float, DATA_LAYOUT_I_MAJOR_MIRRORED>::get_i(l);
			
 
				+        }
			
 
				+
			
 
				+        static __device__ __forceinline__ int get_j(const int l) {
			
 
				+            return tile<I_, J_, float, DATA_LAYOUT_I_MAJOR_MIRRORED>::get_j(l);
			
 
				+        }
			
 
				+    };
			
 
				+
			
 
				+    template <int I_, int J_>
			
 
				+    struct tile<I_, J_, half2, DATA_LAYOUT_J_MAJOR_MIRRORED> {
			
 
				+        static constexpr int         I  = I_;
			
 
				+        static constexpr int         J  = J_;
			
 
				+        static constexpr data_layout dl = DATA_LAYOUT_J_MAJOR_MIRRORED;
			
 
				+        static constexpr int         ne = I * J / (WARP_SIZE/4);
			
 
				+
			
 
				+        half2 x[ne] = {{0.0f, 0.0f}};
			
 
				 
			
 
				         static constexpr __device__ bool supported() {
			
 
				-            if (I == 16 && J == 16) return true;
			
 
				-            if (I == 16 && J == 8)  return true;
			
 
				-            if (I == 16 && J == 4)  return true;
			
 
				+            if (I ==  8 && J ==  4) return true;
			
 
				             return false;
			
 
				         }
			
 
				 
			
 
				         static __device__ __forceinline__ int get_i(const int l) {
			
 
				-            if constexpr (supported()) {
			
 
				-                return threadIdx.x % 16;
			
 
				+            if constexpr (I == 8 && J == 4) {
			
 
				+                return ((l / 2) * 4) + (threadIdx.x % 4);
			
 
				             } else {
			
 
				                 NO_DEVICE_CODE;
			
 
				                 return -1;
			
@@ -555,8 +594,8 @@ namespace ggml_cuda_mma {
 
				         }
			
 
				 
			
 
				         static __device__ __forceinline__ int get_j(const int l) {
			
 
				-            if constexpr (supported()) {
			
 
				-                return l;
			
 
				+            if constexpr (I == 8 && J == 4) {
			
 
				+                return ((threadIdx.x / 16) * 2) + (l % 2);
			
 
				             } else {
			
 
				                 NO_DEVICE_CODE;
			
 
				                 return -1;