@@ -173,6 +173,9 @@ namespace ggml_cuda_mma {
 #elif defined(AMD_WMMA_AVAILABLE)
 #if defined(RDNA4)
         static constexpr int ne = I * J / 32;
+#elif defined(RDNA3)
+        static constexpr int ne = (I == 16 && J == 16) ? I * J / 32 : I * J / 16;
+#endif // defined(RDNA4)
         T x[ne] = {0};
 
         static constexpr __device__ bool supported() {
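The new `ne` values keep I * J / 32 elements per lane on RDNA4 for every tile, while RDNA3 doubles the per-lane count for the 16x8 and 16x4 operand tiles (the 16x16 accumulator is unchanged). A standalone arithmetic check of the hunk above, plain C++ with illustrative names, not part of the patch:

    #include <cstdio>

    constexpr int ne_rdna4(int I, int J) { return I * J / 32; }
    constexpr int ne_rdna3(int I, int J) { return (I == 16 && J == 16) ? I * J / 32 : I * J / 16; }

    int main() {
        const int dims[3][2] = {{16, 16}, {16, 8}, {16, 4}};
        for (const auto & d : dims) {
            std::printf("tile<%2d,%2d>: ne = %d (RDNA4), %d (RDNA3)\n",
                        d[0], d[1], ne_rdna4(d[0], d[1]), ne_rdna3(d[0], d[1]));
        }
        return 0;
    }

It prints 8/8, 4/8 and 2/4: the doubled RDNA3 operand fragments are what the int loads further down fill with a full row copy per lane.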
@@ -182,7 +185,11 @@ namespace ggml_cuda_mma {
 
         static __device__ __forceinline__ int get_i(const int l) {
             if constexpr (I == 16 && J == 16) {
+#if defined(RDNA4)
                 return 8 * (threadIdx.x / 16) + l;
+#elif defined(RDNA3)
+                return 2 * l + (threadIdx.x / 16);
+#endif // defined(RDNA4)
             } else {
                 NO_DEVICE_CODE;
                 return -1;
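The two `get_i` formulas encode different row ownership for the 16x16 accumulator: on RDNA4 each half of the wave holds eight consecutive rows, on RDNA3 the two halves interleave even and odd rows. A host-side sketch of the mapping (illustrative, not part of the patch):

    #include <cstdio>

    int get_i_rdna4(int lane, int l) { return 8 * (lane / 16) + l; }  // l in [0, 8)
    int get_i_rdna3(int lane, int l) { return 2 * l + lane / 16;   }  // l in [0, 8)

    int main() {
        for (int lane : {0, 16}) {  // one lane from each wave half
            std::printf("lane %2d rows RDNA4:", lane);
            for (int l = 0; l < 8; ++l) std::printf(" %2d", get_i_rdna4(lane, l));
            std::printf("  RDNA3:");
            for (int l = 0; l < 8; ++l) std::printf(" %2d", get_i_rdna3(lane, l));
            std::printf("\n");
        }
        return 0;
    }

Lane 0 covers rows 0..7 on RDNA4 but the even rows 0, 2, ..., 14 on RDNA3; lane 16 covers rows 8..15 versus the odd rows.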
@@ -197,7 +204,6 @@ namespace ggml_cuda_mma {
                 return -1;
             }
         }
-#endif
 #else
         static constexpr int ne = I * J / 32;
         T x[ne] = {0};
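The deleted `#endif` previously closed the `#if defined(RDNA4)` that opens this section; with the RDNA3 branch added above, that conditional now ends right after the `ne` definitions and the remaining members are shared. The resulting guard structure, as a skeleton (my summary of the change, not patch text):

    #elif defined(AMD_WMMA_AVAILABLE)
    #if defined(RDNA4)
        // RDNA4-specific constants (ne)
    #elif defined(RDNA3)
        // RDNA3-specific constants (ne)
    #endif // defined(RDNA4)
        // storage and accessors shared by both targets, with per-arch
        // #if blocks only where the layouts differ (as in get_i above)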
@@ -284,6 +290,7 @@ namespace ggml_cuda_mma {
             }
         }
 #elif defined(AMD_WMMA_AVAILABLE)
+
         static constexpr int ne = I * J / 32;
         half2 x[ne] = {{0.0f, 0.0f}};
 
@@ -544,23 +551,40 @@ namespace ggml_cuda_mma {
         } else if constexpr (std::is_same_v<T, int>) {
             if constexpr (I == 16 && J == 4) {
                 int64_t * xi = (int64_t *) t.x;
+#if defined(RDNA4)
                 const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 2 * (threadIdx.x / t.I));
                 xi[0] = xs[0];
-
+#elif defined(RDNA3)
+                static_assert(tile<I,J,T>::ne >= 4, "fragment too small");
+                const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride);
+                xi[0] = xs[0];
+                xi[1] = xs[1];
+#endif // defined(RDNA4)
             }else if constexpr (I == 16 && J == 8) {
                 int64_t * xi = (int64_t *) t.x;
+#if defined(RDNA4)
                 const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 4 * (threadIdx.x / t.I));
                 xi[0] = xs[0];
 
                 const int64_t * xs1 = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 4 * (threadIdx.x / t.I) + 2);
                 xi[1] = xs1[0];
+#elif defined(RDNA3)
+                static_assert(tile<I,J,T>::ne >= 8, "fragment too small");
+                const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride);
+                // each lane loads four contiguous 64-bit chunks for the wider RDNA3 fragment
+                xi[0] = xs[0];
+                xi[1] = xs[1];
+                const int64_t * xs1 = xs + 2;
+                xi[2] = xs1[0];
+                xi[3] = xs1[1];
+#endif // defined(RDNA4)
 
             }else{
                 NO_DEVICE_CODE;
             }
         } else {
             NO_DEVICE_CODE;
         }
 #else
 #pragma unroll
         for (int l = 0; l < t.ne; ++l) {
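Note the `#endif // defined(RDNA4)` must sit before the shared `}else{` branches: closing it later would preprocess those closing braces away on RDNA4 builds. What the J == 8 int load does per lane, in plain terms: every lane selects row threadIdx.x % 16; on RDNA4 the two wave halves split that row (4 ints each), while on RDNA3 each lane copies the whole 8-int row, so both halves hold a full replica. A host-side sketch of the addressing (illustrative, not from the patch):

    #include <cstdio>

    // First int read by a lane from a row-major int matrix with leading
    // dimension `stride` (in ints), mirroring the J == 8 load above.
    int first_elem_rdna4(int lane, int stride) { return (lane % 16) * stride + 4 * (lane / 16); }
    int first_elem_rdna3(int lane, int stride) { return (lane % 16) * stride; }

    int main() {
        const int stride = 8;      // one tightly packed 16x8 int tile
        for (int lane : {3, 19}) { // same row, opposite wave halves
            std::printf("lane %2d: RDNA4 ints [%d..%d), RDNA3 ints [%d..%d)\n",
                        lane,
                        first_elem_rdna4(lane, stride), first_elem_rdna4(lane, stride) + 4,
                        first_elem_rdna3(lane, stride), first_elem_rdna3(lane, stride) + 8);
        }
        return 0;
    }

Lanes 3 and 19 read ints [24..28) and [28..32) on RDNA4, but both read the full row [24..32) on RDNA3, matching the doubled `ne` from the first hunk.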
@@ -858,12 +882,14 @@ namespace ggml_cuda_mma {
                   : "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[3]));
 #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
 #elif defined(AMD_WMMA_AVAILABLE)
+#if defined(RDNA4)
         using halfx8_t = __attribute__((ext_vector_type(8))) _Float16;
         using floatx8_t = __attribute__((ext_vector_type(8))) float;
         floatx8_t& acc_frag = reinterpret_cast<floatx8_t&>(D.x[0]);
         const halfx8_t& a_frag = reinterpret_cast<const halfx8_t&>(A.x[0]);
         const halfx8_t& b_frag = reinterpret_cast<const halfx8_t&>(B.x[0]);
         acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12(a_frag, b_frag, acc_frag);
+#endif // defined(RDNA4)
 #else
         GGML_UNUSED_VARS(D, A, B);
         NO_DEVICE_CODE;
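For reference, the gfx12 builtin used here consumes one 8-wide vector per lane for each operand and accumulates in place; only the RDNA4 guard is filled in, so the f16 path stays a no-op on RDNA3 for now. A minimal standalone HIP kernel in the same shape (assumes an RDNA4 target such as --offload-arch=gfx1200; kernel name and launch geometry are illustrative):

    #include <hip/hip_runtime.h>

    using halfx8_t  = __attribute__((ext_vector_type(8))) _Float16;
    using floatx8_t = __attribute__((ext_vector_type(8))) float;

    // One wave32 computes a 16x16x16 f16 product with f32 accumulation;
    // each lane owns an 8-element slice of every fragment. Launch with
    // blockDim.x == 32.
    __global__ void wmma_f16_demo(const halfx8_t * a, const halfx8_t * b, floatx8_t * d) {
        const int lane = threadIdx.x;
        floatx8_t acc = d[lane];
        acc = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12(a[lane], b[lane], acc);
        d[lane] = acc;
    }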
@@ -873,12 +899,14 @@ namespace ggml_cuda_mma {
     static __device__ __forceinline__ void mma(
             tile<16, 16, float> & D, const tile<16, 8, nv_bfloat162> & A, const tile<16, 8, nv_bfloat162> & B) {
 #if defined(AMD_WMMA_AVAILABLE)
+#if defined(RDNA4)
         using bf16x8_t = __attribute__((ext_vector_type(8))) __bf16;
         using floatx8_t = __attribute__((ext_vector_type(8))) float;
         floatx8_t& acc_frag = reinterpret_cast<floatx8_t&>(D.x[0]);
         const bf16x8_t& a_frag = reinterpret_cast<const bf16x8_t&>(A.x[0]);
         const bf16x8_t& b_frag = reinterpret_cast<const bf16x8_t&>(B.x[0]);
         acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12(a_frag, b_frag, acc_frag);
+#endif // defined(RDNA4)
 #else
         GGML_UNUSED_VARS(D, A, B);
         NO_DEVICE_CODE;
|
|
|
#endif // defined(CDNA3)
|
|
#endif // defined(CDNA3)
|
|
|
|
|
|
|
|
#elif defined(AMD_WMMA_AVAILABLE)
|
|
#elif defined(AMD_WMMA_AVAILABLE)
|
|
|
- using int32x2_t = __attribute__((__vector_size__(2 * sizeof(int)))) int;
|
|
|
|
|
- int32x2_t * a_vec = (int32x2_t *) A.x;
|
|
|
|
|
- int32x2_t * b_vec = (int32x2_t *) B.x;
|
|
|
|
|
|
|
|
|
|
using int32x8_t = __attribute__((__vector_size__(8 * sizeof(int)))) int;
|
|
using int32x8_t = __attribute__((__vector_size__(8 * sizeof(int)))) int;
|
|
|
int32x8_t * acc = (int32x8_t *) D.x;
|
|
int32x8_t * acc = (int32x8_t *) D.x;
|
|
|
|
|
|
|
|
#if defined(RDNA4)
|
|
#if defined(RDNA4)
|
|
|
|
|
+ using int32x2_t = __attribute__((__vector_size__(2 * sizeof(int)))) int;
|
|
|
|
|
+ int32x2_t * a_vec = (int32x2_t *) A.x;
|
|
|
|
|
+ int32x2_t * b_vec = (int32x2_t *) B.x;
|
|
|
|
|
|
|
|
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(
|
|
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(
|
|
|
true,
|
|
true,
|
|
@@ -933,7 +961,30 @@ namespace ggml_cuda_mma {
             acc[0],
             true
         );
-#endif // defined(RDNA4)
+
+#elif defined(RDNA3)
+        using int32x4_t = __attribute__((__vector_size__(4 * sizeof(int)))) int;
+        int32x4_t * a_vec = (int32x4_t *) A.x;
+        int32x4_t * b_vec = (int32x4_t *) B.x;
+
+        acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32(
+            true,
+            a_vec[0],
+            true,
+            b_vec[0],
+            acc[0],
+            true
+        );
+
+        acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32(
+            true,
+            a_vec[1],
+            true,
+            b_vec[1],
+            acc[0],
+            true
+        );
+#endif // defined(RDNA4)
 
 #else
         GGML_UNUSED_VARS(D, A, B);
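Why the RDNA3 path issues two wmma calls: A and B here are 16x8 int tiles, i.e. 32 packed int8 values per row, and the 16x16x16 builtin consumes K = 16 per call, so the product appears to chain over the two K halves into the same accumulator (a_vec[0]/b_vec[0], then a_vec[1]/b_vec[1]), matching the two gfx12 calls on the RDNA4 side. A scalar reference for one output element (illustrative, not patch code):

    #include <cstdio>

    int main() {
        signed char a[32], b[32];
        for (int k = 0; k < 32; ++k) { a[k] = (signed char) (k - 16); b[k] = 2; }

        int acc = 0;
        for (int half = 0; half < 2; ++half) {  // one builtin call per K half
            for (int k = 0; k < 16; ++k) {
                acc += a[16 * half + k] * b[16 * half + k];
            }
        }

        int ref = 0;
        for (int k = 0; k < 32; ++k) { ref += a[k] * b[k]; }
        std::printf("chained = %d, direct = %d\n", acc, ref);  // equal by associativity
        return 0;
    }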
@@ -1020,27 +1071,40 @@ namespace ggml_cuda_mma {
     static __device__ __forceinline__ void mma(
             tile<16, 16, int> & D, const tile<16, 4, int> & A, const tile<16, 4, int> & B) {
 #if defined(AMD_WMMA_AVAILABLE)
-        using int32x2_t = __attribute__((__vector_size__(2 * sizeof(int)))) int;
-        int32x2_t * a_vec = (int32x2_t *) A.x;
-        int32x2_t * b_vec = (int32x2_t *) B.x;
-
-        using int32x8_t = __attribute__((__vector_size__(8 * sizeof(int)))) int;
-        int32x8_t * acc = (int32x8_t *) D.x;
-
-        acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(
-            true,
-            a_vec[0],
-            true,
-            b_vec[0],
-            acc[0],
-            false
-        );
+        using int32x8_t = __attribute__((__vector_size__(8 * sizeof(int)))) int;
+        int32x8_t * acc = (int32x8_t *) D.x;
+#if defined(RDNA4)
+        using int32x2_t = __attribute__((__vector_size__(2 * sizeof(int)))) int;
+        int32x2_t * a_vec = (int32x2_t *) A.x;
+        int32x2_t * b_vec = (int32x2_t *) B.x;
+
+        acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(
+            true,
+            a_vec[0],
+            true,
+            b_vec[0],
+            acc[0],
+            false
+        );
+#elif defined(RDNA3)
+        using int32x4_t = __attribute__((__vector_size__(4 * sizeof(int)))) int;
+        int32x4_t * a_vec = (int32x4_t *) A.x;
+        int32x4_t * b_vec = (int32x4_t *) B.x;
+
+        acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32(
+            true,
+            a_vec[0],
+            true,
+            b_vec[0],
+            acc[0],
+            false
+        );
+#endif // defined(RDNA4)
 #else
         GGML_UNUSED(D);
         GGML_UNUSED(A);
         GGML_UNUSED(B);
         NO_DEVICE_CODE;
-#endif
+#endif // AMD_WMMA_AVAILABLE
     }
 }
-