@@ -288,6 +288,9 @@ struct vk_device_struct {
     bool coopmat_acc_f32_support {};
     bool coopmat_acc_f16_support {};
     bool coopmat_bf16_support {};
+    bool coopmat_support_16x16x16_f16acc {};
+    bool coopmat_support_16x16x16_f32acc {};
+    bool coopmat1_fa_support {};
     uint32_t coopmat_m;
     uint32_t coopmat_n;
     uint32_t coopmat_k;
@@ -410,6 +413,13 @@ struct vk_device_struct {
     vk_pipeline pipeline_flash_attn_f32_f16_D128_cm2[GGML_TYPE_COUNT][2][2][2];
     vk_pipeline pipeline_flash_attn_f32_f16_D256_cm2[GGML_TYPE_COUNT][2][2][2];
 
+    vk_pipeline pipeline_flash_attn_f32_f16_D64_cm1[GGML_TYPE_COUNT][2][2][2];
+    vk_pipeline pipeline_flash_attn_f32_f16_D80_cm1[GGML_TYPE_COUNT][2][2][2];
+    vk_pipeline pipeline_flash_attn_f32_f16_D96_cm1[GGML_TYPE_COUNT][2][2][2];
+    vk_pipeline pipeline_flash_attn_f32_f16_D112_cm1[GGML_TYPE_COUNT][2][2][2];
+    vk_pipeline pipeline_flash_attn_f32_f16_D128_cm1[GGML_TYPE_COUNT][2][2][2];
+    vk_pipeline pipeline_flash_attn_f32_f16_D256_cm1[GGML_TYPE_COUNT][2][2][2];
+
     vk_pipeline pipeline_flash_attn_f32_f16_D64[GGML_TYPE_COUNT][2][2][2];
     vk_pipeline pipeline_flash_attn_f32_f16_D80[GGML_TYPE_COUNT][2][2][2];
     vk_pipeline pipeline_flash_attn_f32_f16_D96[GGML_TYPE_COUNT][2][2][2];
@@ -1588,19 +1598,36 @@ static void ggml_vk_wait_events(vk_context& ctx, std::vector<vk::Event>&& events
     );
 }
 
+enum FaCodePath {
+    FA_SCALAR,
+    FA_COOPMAT1,
+    FA_COOPMAT2,
+};
+
 // number of rows/cols for flash attention shader
 static constexpr uint32_t flash_attention_num_small_rows = 32;
 static constexpr uint32_t scalar_flash_attention_num_small_rows = 1;
 static constexpr uint32_t scalar_flash_attention_num_large_rows = 8;
 
-static uint32_t get_fa_num_small_rows(bool scalar) {
-    return scalar ? scalar_flash_attention_num_small_rows : flash_attention_num_small_rows;
+// The FA coopmat1 shader assumes 16x16x16 matrix multiply support.
+// 128 threads split into four subgroups, each subgroup does 1/4
+// of the Bc dimension.
+static constexpr uint32_t coopmat1_flash_attention_num_large_rows = 16;
+static constexpr uint32_t scalar_flash_attention_Bc = 64;
+static constexpr uint32_t scalar_flash_attention_workgroup_size = 128;
+
+static uint32_t get_fa_num_small_rows(FaCodePath path) {
+    if (path == FA_COOPMAT2) {
+        return flash_attention_num_small_rows;
+    } else {
+        return scalar_flash_attention_num_small_rows;
+    }
 }
 
-static std::array<uint32_t, 2> fa_rows_cols(bool scalar, uint32_t D, uint32_t clamp, ggml_type type, bool small_rows) {
+static std::array<uint32_t, 2> fa_rows_cols(FaCodePath path, uint32_t D, uint32_t clamp, ggml_type type, bool small_rows) {
     GGML_UNUSED(clamp);
 
-    if (scalar) {
+    if (path == FA_SCALAR) {
         if (small_rows) {
             return {scalar_flash_attention_num_small_rows, 64};
         } else {
@@ -1608,9 +1635,17 @@ static std::array<uint32_t, 2> fa_rows_cols(bool scalar, uint32_t D, uint32_t cl
         }
     }
 
+    if (path == FA_COOPMAT1) {
+        if (small_rows) {
+            return {scalar_flash_attention_num_small_rows, scalar_flash_attention_Bc};
+        } else {
+            return {coopmat1_flash_attention_num_large_rows, scalar_flash_attention_Bc};
+        }
+    }
+
     // small rows, large cols
     if (small_rows) {
-        return {get_fa_num_small_rows(scalar), 32};
+        return {get_fa_num_small_rows(FA_COOPMAT2), 32};
    }
 
     // small cols to reduce register count
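A quick sanity check on the coopmat1 tile shape chosen above (a back-of-the-envelope sketch, using the constants just introduced): a Br x Bc = 16 x 64 tile processed by 128 threads in four 32-invocation subgroups gives each subgroup a 16-wide slice of the Bc dimension, which lines up exactly with one 16x16x16 cooperative matrix multiply per K-block step.

    // Illustrative only; constants mirror the definitions above.
    #include <cstdint>

    constexpr uint32_t Br            = 16;  // coopmat1_flash_attention_num_large_rows
    constexpr uint32_t Bc            = 64;  // scalar_flash_attention_Bc
    constexpr uint32_t wg_size       = 128; // scalar_flash_attention_workgroup_size
    constexpr uint32_t subgroup_size = 32;  // assumed by the coopmat1 FA shader

    constexpr uint32_t num_subgroups   = wg_size / subgroup_size; // 4
    constexpr uint32_t bc_per_subgroup = Bc / num_subgroups;      // 16

    // Each subgroup covers a Br x 16 = 16 x 16 slice, i.e. exactly one
    // 16x16x16 cooperative matrix multiply per iteration.
    static_assert(num_subgroups == 4 && bc_per_subgroup == 16, "tile layout");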
@@ -1907,17 +1942,19 @@ static void ggml_vk_load_shaders(vk_device& device) {
             parameter_count, wg_denoms, specialization_constants, disable_robustness, require_full_subgroups, required_subgroup_size));
     };
 
-    auto const &fa_wg_denoms = [&](bool scalar, uint32_t D, uint32_t clamp, ggml_type type, bool small_rows) -> std::array<uint32_t, 3> {
-        return {fa_rows_cols(scalar, D, clamp, type, small_rows)[0], 1, 1};
+    auto const &fa_wg_denoms = [&](FaCodePath path, uint32_t D, uint32_t clamp, ggml_type type, bool small_rows) -> std::array<uint32_t, 3> {
+        return {fa_rows_cols(path, D, clamp, type, small_rows)[0], 1, 1};
     };
 
-    auto const &fa_spec_constants = [&](bool scalar, uint32_t D, uint32_t clamp, ggml_type type, bool small_rows) -> std::vector<uint32_t> {
+    auto const &fa_spec_constants = [&](FaCodePath path, uint32_t D, uint32_t clamp, ggml_type type, bool small_rows) -> std::vector<uint32_t> {
         // For large number of rows, 128 invocations seems to work best.
         // For small number of rows (e.g. N==1), 256 works better. But matrix granularity for 256 is 32, so we
         // can't use 256 for D==80.
         // For scalar, use 128 (arbitrary)
-        uint32_t wg_size = scalar ? 128 : ((small_rows && (D % 32) == 0) ? 256 : 128);
-        auto rows_cols = fa_rows_cols(scalar, D, clamp, type, small_rows);
+        uint32_t wg_size = (path == FA_SCALAR || path == FA_COOPMAT1)
+                           ? scalar_flash_attention_workgroup_size
+                           : ((small_rows && (D % 32) == 0) ? 256 : 128);
+        auto rows_cols = fa_rows_cols(path, D, clamp, type, small_rows);
 
         // D_split can't be larger than a subgroup because we use subgroupShuffle to reduce it.
         // D_split can't be larger than the LSB of D divided by 4 due to vectorization in the shader.
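To make the workgroup-size comment concrete, here is the selection pulled out as a standalone sketch (hypothetical helper, mirroring the lambda above): D=80 is ruled out for 256 invocations because the matrix granularity at that size is 32, while D=64 with small rows on the coopmat2 path does get 256.

    // Standalone sketch of the wg_size choice in fa_spec_constants (illustrative).
    #include <cstdint>
    enum FaCodePath { FA_SCALAR, FA_COOPMAT1, FA_COOPMAT2 };

    static uint32_t pick_wg_size(FaCodePath path, uint32_t D, bool small_rows) {
        if (path == FA_SCALAR || path == FA_COOPMAT1) {
            return 128; // scalar_flash_attention_workgroup_size
        }
        // coopmat2: 256 helps small-row (e.g. N==1) cases, but the matrix
        // granularity for 256 invocations is 32, so D must be a multiple of 32.
        return (small_rows && (D % 32) == 0) ? 256 : 128;
    }

    // pick_wg_size(FA_COOPMAT2,  64, true)  == 256
    // pick_wg_size(FA_COOPMAT2,  80, true)  == 128  (80 % 32 != 0)
    // pick_wg_size(FA_COOPMAT1, 128, false) == 128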
@@ -1929,36 +1966,43 @@ static void ggml_vk_load_shaders(vk_device& device) {
         return {wg_size, rows_cols[0], rows_cols[1], (D), clamp, D_split};
     };
 
-#define CREATE_FA2(TYPE, NAMELC, SCALAR, SUFFIX, D) \
-        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][0][0][0], "flash_attn_f32_f16_D" #D "_f16acc" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(SCALAR, D,1,TYPE,false), fa_spec_constants(SCALAR, D,1,TYPE,false), 1, true); \
-        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][0][0][1], "flash_attn_f32_f16_D" #D "_aligned_f16acc" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(SCALAR, D,0,TYPE,false), fa_spec_constants(SCALAR, D,0,TYPE,false), fa_rows_cols(SCALAR,D,0,TYPE,false)[1], true); \
-        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][1][0][0], "flash_attn_f32_f16_D" #D "_f32acc" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(SCALAR, D,1,TYPE,false), fa_spec_constants(SCALAR, D,1,TYPE,false), 1, true); \
-        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][1][0][1], "flash_attn_f32_f16_D" #D "_aligned_f32acc" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(SCALAR, D,0,TYPE,false), fa_spec_constants(SCALAR, D,0,TYPE,false), fa_rows_cols(SCALAR,D,0,TYPE,false)[1], true); \
-        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][0][1][0], "flash_attn_f32_f16_D" #D "_f16acc_smallrows" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(SCALAR, D,1,TYPE,true), fa_spec_constants(SCALAR, D,1,TYPE,true), 1, true); \
-        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][0][1][1], "flash_attn_f32_f16_D" #D "_aligned_f16acc_smallrows" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(SCALAR, D,0,TYPE,true), fa_spec_constants(SCALAR, D,0,TYPE,true), fa_rows_cols(SCALAR,D,0,TYPE,true)[1], true); \
-        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][1][1][0], "flash_attn_f32_f16_D" #D "_f32acc_smallrows" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(SCALAR, D,1,TYPE,true), fa_spec_constants(SCALAR, D,1,TYPE,true), 1, true); \
-        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][1][1][1], "flash_attn_f32_f16_D" #D "_aligned_f32acc_smallrows" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(SCALAR, D,0,TYPE,true), fa_spec_constants(SCALAR, D,0,TYPE,true), fa_rows_cols(SCALAR,D,0,TYPE,true)[1], true); \
-
-#define CREATE_FA(TYPE, NAMELC, SCALAR, SUFFIX) \
-    CREATE_FA2(TYPE, NAMELC, SCALAR, SUFFIX, 64) \
-    CREATE_FA2(TYPE, NAMELC, SCALAR, SUFFIX, 80) \
-    CREATE_FA2(TYPE, NAMELC, SCALAR, SUFFIX, 96) \
-    CREATE_FA2(TYPE, NAMELC, SCALAR, SUFFIX, 112) \
-    CREATE_FA2(TYPE, NAMELC, SCALAR, SUFFIX, 128) \
-    CREATE_FA2(TYPE, NAMELC, SCALAR, SUFFIX, 256)
-
-    CREATE_FA(GGML_TYPE_F16, f16, true, )
-    CREATE_FA(GGML_TYPE_Q4_0, q4_0, true, )
-    CREATE_FA(GGML_TYPE_Q8_0, q8_0, true, )
+#define CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, D) \
+        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][0][0][0], "flash_attn_f32_f16_D" #D "_f16acc" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, D,1,TYPE,false), fa_spec_constants(FAPATH, D,1,TYPE,false), 1, true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \
+        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][0][0][1], "flash_attn_f32_f16_D" #D "_aligned_f16acc" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, D,0,TYPE,false), fa_spec_constants(FAPATH, D,0,TYPE,false), fa_rows_cols(FAPATH,D,0,TYPE,false)[1], true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \
+        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][1][0][0], "flash_attn_f32_f16_D" #D "_f32acc" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, D,1,TYPE,false), fa_spec_constants(FAPATH, D,1,TYPE,false), 1, true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \
+        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][1][0][1], "flash_attn_f32_f16_D" #D "_aligned_f32acc" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, D,0,TYPE,false), fa_spec_constants(FAPATH, D,0,TYPE,false), fa_rows_cols(FAPATH,D,0,TYPE,false)[1], true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \
+        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][0][1][0], "flash_attn_f32_f16_D" #D "_f16acc_smallrows" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, D,1,TYPE,true), fa_spec_constants(FAPATH, D,1,TYPE,true), 1, true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \
+        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][0][1][1], "flash_attn_f32_f16_D" #D "_aligned_f16acc_smallrows" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, D,0,TYPE,true), fa_spec_constants(FAPATH, D,0,TYPE,true), fa_rows_cols(FAPATH,D,0,TYPE,true)[1], true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \
+        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][1][1][0], "flash_attn_f32_f16_D" #D "_f32acc_smallrows" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, D,1,TYPE,true), fa_spec_constants(FAPATH, D,1,TYPE,true), 1, true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \
+        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][1][1][1], "flash_attn_f32_f16_D" #D "_aligned_f32acc_smallrows" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, D,0,TYPE,true), fa_spec_constants(FAPATH, D,0,TYPE,true), fa_rows_cols(FAPATH,D,0,TYPE,true)[1], true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \
+
+#define CREATE_FA(TYPE, NAMELC, FAPATH, SUFFIX) \
+    CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, 64) \
+    CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, 80) \
+    CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, 96) \
+    CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, 112) \
+    CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, 128) \
+    CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, 256)
+
+    CREATE_FA(GGML_TYPE_F16, f16, FA_SCALAR, )
+    CREATE_FA(GGML_TYPE_Q4_0, q4_0, FA_SCALAR, )
+    CREATE_FA(GGML_TYPE_Q8_0, q8_0, FA_SCALAR, )
+#if defined(VK_KHR_cooperative_matrix) && defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
+    if (device->coopmat1_fa_support) {
+        CREATE_FA(GGML_TYPE_F16, f16, FA_COOPMAT1, _cm1)
+        CREATE_FA(GGML_TYPE_Q4_0, q4_0, FA_COOPMAT1, _cm1)
+        CREATE_FA(GGML_TYPE_Q8_0, q8_0, FA_COOPMAT1, _cm1)
+    }
+#endif
 #if defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
     if (device->coopmat2) {
-        CREATE_FA(GGML_TYPE_F16, f16, false, _cm2)
-        CREATE_FA(GGML_TYPE_Q4_0, q4_0, false, _cm2)
-        CREATE_FA(GGML_TYPE_Q4_1, q4_1, false, _cm2)
-        CREATE_FA(GGML_TYPE_Q5_0, q5_0, false, _cm2)
-        CREATE_FA(GGML_TYPE_Q5_1, q5_1, false, _cm2)
-        CREATE_FA(GGML_TYPE_Q8_0, q8_0, false, _cm2)
-        CREATE_FA(GGML_TYPE_IQ4_NL, iq4_nl, false, _cm2)
+        CREATE_FA(GGML_TYPE_F16, f16, FA_COOPMAT2, _cm2)
+        CREATE_FA(GGML_TYPE_Q4_0, q4_0, FA_COOPMAT2, _cm2)
+        CREATE_FA(GGML_TYPE_Q4_1, q4_1, FA_COOPMAT2, _cm2)
+        CREATE_FA(GGML_TYPE_Q5_0, q5_0, FA_COOPMAT2, _cm2)
+        CREATE_FA(GGML_TYPE_Q5_1, q5_1, FA_COOPMAT2, _cm2)
+        CREATE_FA(GGML_TYPE_Q8_0, q8_0, FA_COOPMAT2, _cm2)
+        CREATE_FA(GGML_TYPE_IQ4_NL, iq4_nl, FA_COOPMAT2, _cm2)
     }
 #endif
 #undef CREATE_FA2
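For reference, a hand-expanded first line of CREATE_FA2(GGML_TYPE_F16, f16, FA_COOPMAT1, _cm1, 64) (a sketch of what the preprocessor produces, so treat token spellings as illustrative): the last two arguments are the ones this change adds, requiring full subgroups at a pinned size of 32 on the coopmat1 path.

    ggml_vk_create_pipeline(device,
        device->pipeline_flash_attn_f32_f16_D64_cm1[GGML_TYPE_F16][0][0][0],
        "flash_attn_f32_f16_D64_f16accf16_cm1",
        flash_attn_f32_f16_f16_f16acc_cm1_len, flash_attn_f32_f16_f16_f16acc_cm1_data,
        "main", 5, sizeof(vk_flash_attn_push_constants),
        fa_wg_denoms(FA_COOPMAT1, 64, 1, GGML_TYPE_F16, false),
        fa_spec_constants(FA_COOPMAT1, 64, 1, GGML_TYPE_F16, false),
        1, true,
        true, // require_full_subgroups: FAPATH==FA_COOPMAT1
        32);  // required_subgroup_size: the coopmat1 shader assumes 32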
@@ -2041,17 +2085,17 @@ static void ggml_vk_load_shaders(vk_device& device) {
     // Create 6 variants, {s,m,l}x{unaligned,aligned}
 #define CREATE_MM(TYPE, PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
         if (device->mul_mat ## ID ## _l[TYPE]) \
-            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1, false, true); \
+            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _cm1_len, NAMELC ## F16ACC ## _cm1_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1, false, true); \
         if (device->mul_mat ## ID ## _m[TYPE]) \
-            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1, false, true); \
+            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _cm1_len, NAMELC ## F16ACC ## _cm1_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1, false, true); \
         if (device->mul_mat ## ID ## _s[TYPE]) \
-            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1, false, true); \
+            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _cm1_len, NAMELC ## F16ACC ## _cm1_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1, false, true); \
         if (device->mul_mat ## ID ## _l[TYPE]) \
-            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align, false, true); \
+            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _cm1_len, NAMELC ## _aligned ## F16ACC ## _cm1_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align, false, true); \
         if (device->mul_mat ## ID ## _m[TYPE]) \
-            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align, false, true); \
+            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _cm1_len, NAMELC ## _aligned ## F16ACC ## _cm1_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align, false, true); \
         if (device->mul_mat ## ID ## _s[TYPE]) \
-            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align, false, true); \
+            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _cm1_len, NAMELC ## _aligned ## F16ACC ## _cm1_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align, false, true); \
 
     // Create 2 variants, {f16,f32} accumulator
 #define CREATE_MM2(TYPE, PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
@@ -3009,6 +3053,11 @@ static vk_device ggml_vk_get_device(size_t idx) {
 
 #if defined(VK_KHR_cooperative_matrix)
     device->coopmat_support = device->coopmat_support && coopmat_features.cooperativeMatrix;
+
+    // coopmat1 fa shader currently assumes 32 invocations per subgroup
+    device->coopmat1_fa_support = device->coopmat_support && device->subgroup_require_full_support &&
+                                  device->subgroup_size_control && device->subgroup_min_size <= 32 &&
+                                  device->subgroup_max_size >= 32;
 #endif
 
     if (coopmat2_support) {
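The subgroup requirements checked here correspond to pipeline-creation plumbing that pins the subgroup size; roughly the following, sketched with the raw VK_EXT_subgroup_size_control structures (core in Vulkan 1.3), where stage_info stands in for whatever VkPipelineShaderStageCreateInfo is being assembled:

    // Sketch: pinning the subgroup size to 32 for the coopmat1 FA pipelines.
    VkPipelineShaderStageRequiredSubgroupSizeCreateInfo subgroup_size_info{};
    subgroup_size_info.sType =
        VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO;
    subgroup_size_info.requiredSubgroupSize = 32;

    stage_info.pNext = &subgroup_size_info;
    // Full subgroups, so every invocation participates in the coopmat ops:
    stage_info.flags |= VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT;

This is why the device check demands subgroup_size_control with 32 inside [subgroup_min_size, subgroup_max_size] and full-subgroup support.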
@@ -3143,6 +3192,9 @@ static vk_device ggml_vk_get_device(size_t idx) {
                     // Only enable if shape is identical
                     device->coopmat_acc_f32_support = true;
                 }
+                if (prop.MSize == 16 && prop.NSize == 16 && prop.KSize == 16) {
+                    device->coopmat_support_16x16x16_f32acc = true;
+                }
             } else if ((vk::ComponentTypeKHR)prop.CType == vk::ComponentTypeKHR::eFloat16 &&
                        (vk::ComponentTypeKHR)prop.ResultType == vk::ComponentTypeKHR::eFloat16) {
                 // coopmat sizes not set yet
@@ -3155,6 +3207,9 @@ static vk_device ggml_vk_get_device(size_t idx) {
                     // Only enable if shape is identical
                     device->coopmat_acc_f16_support = true;
                 }
+                if (prop.MSize == 16 && prop.NSize == 16 && prop.KSize == 16) {
+                    device->coopmat_support_16x16x16_f16acc = true;
+                }
             }
         } else if ((vk::ComponentTypeKHR)prop.AType == vk::ComponentTypeKHR::eSint8 &&
                    (vk::ComponentTypeKHR)prop.BType == vk::ComponentTypeKHR::eSint8 &&
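These shape checks run inside a loop over the properties reported by the VK_KHR_cooperative_matrix extension. A sketch of how such entries are enumerated (assuming instance and physical_device handles; the extension entry point is loaded by hand since it is not in the core loader):

    auto fn = (PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR)
        vkGetInstanceProcAddr(instance, "vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR");

    uint32_t count = 0;
    fn(physical_device, &count, nullptr);

    std::vector<VkCooperativeMatrixPropertiesKHR> cm_props(count,
        { VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_PROPERTIES_KHR });
    fn(physical_device, &count, cm_props.data());
    // Each entry reports one supported shape (MSize x NSize x KSize) plus the
    // A/B/C/Result component types that the checks above match against.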
@@ -5688,6 +5743,36 @@ static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context& subctx
     }
 }
 
+static bool ggml_vk_flash_attn_coopmat_shmem_support(const vk_device& device, const uint32_t D, bool f32acc) {
+    // Needs to be kept up to date on shader changes
+    const uint32_t wg_size = scalar_flash_attention_workgroup_size;
+    const uint32_t Br = coopmat1_flash_attention_num_large_rows;
+    const uint32_t Bc = scalar_flash_attention_Bc;
+
+    const uint32_t acctype = f32acc ? 4 : 2;
+    const uint32_t f16vec4 = 8;
+
+    const uint32_t tmpsh = wg_size * sizeof(float);
+    const uint32_t tmpshv4 = wg_size * 4 * acctype;
+
+    const uint32_t Qf = Br * (D / 4 + 2) * f16vec4;
+
+    const uint32_t sfshstride = (D <= 128) ? (Br + 8) : Br;
+    const uint32_t sfsh = Bc * sfshstride * acctype;
+
+    const uint32_t kshstride = D / 4 + 2;
+    const uint32_t ksh = Bc * kshstride * f16vec4;
+
+    const uint32_t slope = Br * sizeof(float);
+
+    const uint32_t total_size = tmpsh + tmpshv4 + Qf + sfsh + ksh + slope;
+    const bool supported = total_size <= device->properties.limits.maxComputeSharedMemorySize;
+
+    VK_LOG_DEBUG("ggml_vk_flash_attn_coopmat_shmem_support(D=" << D << ", f32acc=" << f32acc << ", total_size=" << total_size << ", supported=" << supported);
+
+    return supported;
+}
+
 static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * q, const ggml_tensor * k, const ggml_tensor * v, const ggml_tensor * mask, ggml_tensor * dst, bool dryrun = false) {
     VK_LOG_DEBUG("ggml_vk_flash_attn((" << q << ", name=" << q->name << ", type=" << q->type << ", ne0=" << q->ne[0] << ", ne1=" << q->ne[1] << ", ne2=" << q->ne[2] << ", ne3=" << q->ne[3] << ", nb0=" << q->nb[0] << ", nb1=" << q->nb[1] << ", nb2=" << q->nb[2] << ", nb3=" << q->nb[3];
     std::cerr << "), (" << k << ", name=" << k->name << ", type=" << k->type << ", ne0=" << k->ne[0] << ", ne1=" << k->ne[1] << ", ne2=" << k->ne[2] << ", ne3=" << k->ne[3] << ", nb0=" << k->nb[0] << ", nb1=" << k->nb[1] << ", nb2=" << k->nb[2] << ", nb3=" << k->nb[3];
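Plugging the constants into this budget makes it concrete (worked numbers only; the 32 KiB limit is an assumed, common maxComputeSharedMemorySize value):

    // Worked instance for D=128, f32acc (Br=16, Bc=64, wg_size=128):
    constexpr uint32_t tmpsh   = 128 * 4;               //   512: one float per invocation
    constexpr uint32_t tmpshv4 = 128 * 4 * 4;           //  2048: one f32 vec4 per invocation
    constexpr uint32_t Qf      = 16 * (128/4 + 2) * 8;  //  4352
    constexpr uint32_t sfsh    = 64 * (16 + 8) * 4;     //  6144: stride Br+8 since D <= 128
    constexpr uint32_t ksh     = 64 * (128/4 + 2) * 8;  // 17408
    constexpr uint32_t slope   = 16 * 4;                //    64
    static_assert(tmpsh + tmpshv4 + Qf + sfsh + ksh + slope == 30528, "fits 32 KiB");
    // 30528 <= 32768, so D=128 with an f32 accumulator fits a 32 KiB limit.
    // For D=256, ksh alone grows to 64 * 66 * 8 = 33792 bytes (total 48960),
    // so that variant only passes on devices reporting a larger limit.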
@@ -5738,7 +5823,19 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
     assert(q->type == GGML_TYPE_F32);
     assert(k->type == v->type);
 
-    bool scalar = !ctx->device->coopmat2;
+    FaCodePath path = ctx->device->coopmat2 ? FA_COOPMAT2 :
+                      ctx->device->coopmat1_fa_support ? FA_COOPMAT1 : FA_SCALAR;
+
+    if (path == FA_COOPMAT1) {
+        const bool coopmat_shape_supported = (dst->op_params[3] == GGML_PREC_F32 && ctx->device->coopmat_support_16x16x16_f32acc) ||
+                                             (dst->op_params[3] != GGML_PREC_F32 && ctx->device->coopmat_support_16x16x16_f16acc);
+
+        const bool coopmat_shmem_supported = ggml_vk_flash_attn_coopmat_shmem_support(ctx->device, D, dst->op_params[3] == GGML_PREC_F32);
+
+        if (!coopmat_shape_supported || !coopmat_shmem_supported) {
+            path = FA_SCALAR;
+        }
+    }
 
     uint32_t gqa_ratio = 1;
     uint32_t qk_ratio = neq2 / nek2;
@@ -5746,9 +5843,21 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
     uint32_t workgroups_y = (uint32_t)neq2;
     uint32_t workgroups_z = (uint32_t)neq3;
 
-    // For scalar FA, we can use the "large" size to accommodate qga.
-    // For coopmat FA, we always use the small size (which is still pretty large for gqa).
-    const uint32_t max_gqa = scalar ? scalar_flash_attention_num_large_rows : get_fa_num_small_rows(false);
+    // For scalar/coopmat1 FA, we can use the "large" size to accommodate gqa.
+    // For coopmat2 FA, we always use the small size (which is still pretty large for gqa).
+    uint32_t max_gqa;
+    switch (path) {
+    case FA_SCALAR:
+    case FA_COOPMAT1:
+        // We may switch from coopmat1 to scalar, so use the scalar limit for both
+        max_gqa = scalar_flash_attention_num_large_rows;
+        break;
+    case FA_COOPMAT2:
+        max_gqa = get_fa_num_small_rows(FA_COOPMAT2);
+        break;
+    default:
+        GGML_ASSERT(0);
+    }
 
     if (N == 1 && qk_ratio > 1 && qk_ratio <= max_gqa &&
         qk_ratio * nek2 == neq2 && nek2 == nev2 && neq3 == 1 && nek3 == 1 && nev3 == 1) {
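For intuition on the gqa packing condition, a worked example with hypothetical tensor shapes: during single-token decode (N == 1) on a model with 32 query heads and 8 KV heads, the four query heads that share each KV head can be packed into one workgroup's rows instead of dispatching per head.

    // Hypothetical GQA shapes during single-token decode:
    const uint32_t neq2 = 32; // query heads
    const uint32_t nek2 = 8;  // KV heads
    const uint32_t qk_ratio = neq2 / nek2; // 4 query heads per KV head

    // qk_ratio (4) <= max_gqa (8 on the scalar/coopmat1 path) and
    // qk_ratio * nek2 == neq2, so the packing branch above is taken.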
@@ -5761,11 +5870,16 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
     }
 
     vk_pipeline *pipelines;
-    // XXX TODO other backends may be changing accumulator precision to default to f32 soon
-    bool f32acc = scalar || dst->op_params[3] == GGML_PREC_F32;
-    bool small_rows = N <= get_fa_num_small_rows(scalar);
+    bool small_rows = N <= get_fa_num_small_rows(path);
 
-    if (scalar) {
+    if (small_rows && path == FA_COOPMAT1) {
+        path = FA_SCALAR;
+    }
+
+    bool f32acc = path == FA_SCALAR || dst->op_params[3] == GGML_PREC_F32;
+
+    switch (path) {
+    case FA_SCALAR:
         switch (D) {
             case 64: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D64[k->type][f32acc][small_rows][0]; break;
             case 80: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D80[k->type][f32acc][small_rows][0]; break;
@@ -5777,7 +5891,21 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
                 GGML_ASSERT(!"unsupported D value");
                 return;
         }
-    } else {
+        break;
+    case FA_COOPMAT1:
+        switch (D) {
+            case 64: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D64_cm1[k->type][f32acc][small_rows][0]; break;
+            case 80: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D80_cm1[k->type][f32acc][small_rows][0]; break;
+            case 96: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D96_cm1[k->type][f32acc][small_rows][0]; break;
+            case 112: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D112_cm1[k->type][f32acc][small_rows][0]; break;
+            case 128: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D128_cm1[k->type][f32acc][small_rows][0]; break;
+            case 256: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D256_cm1[k->type][f32acc][small_rows][0]; break;
+            default:
+                GGML_ASSERT(!"unsupported D value");
+                return;
+        }
+        break;
+    case FA_COOPMAT2:
         switch (D) {
             case 64: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D64_cm2[k->type][f32acc][small_rows][0]; break;
             case 80: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D80_cm2[k->type][f32acc][small_rows][0]; break;
@@ -5789,6 +5917,9 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
                 GGML_ASSERT(!"unsupported D value");
                 return;
         }
+        break;
+    default:
+        GGML_ASSERT(0);
     }
     assert(pipelines);