|
|
@@ -757,7 +757,8 @@ struct vk_device_struct {
|
|
|
|
|
|
vk_pipeline pipeline_flash_attn_split_k_reduce;
|
|
|
|
|
|
- vk_pipeline pipeline_topk_moe[num_topk_moe_pipelines][TOPK_MOE_COUNT];
|
|
|
+ // last dimension [2] selects where n_experts comes from: spec constant (index 0) or push constant (index 1)
|
|
|
+ vk_pipeline pipeline_topk_moe[num_topk_moe_pipelines][TOPK_MOE_COUNT][2];
|
|
|
|
|
|
std::vector<vk_pipeline_ref> all_pipelines;
|
|
|
|
|
|
@@ -1149,6 +1150,7 @@ static_assert(sizeof(vk_op_multi_add_push_constants) <= 256);
|
|
|
|
|
|
struct vk_op_topk_moe_push_constants {
|
|
|
uint32_t n_rows;
|
|
|
+ uint32_t n_experts_push;
|
|
|
uint32_t n_expert_used;
|
|
|
float clamp_min;
|
|
|
float clamp_max;
|
|
|
@@ -4204,10 +4206,12 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|
|
ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_whcn_f16_f32, "conv2d_dw_whcn_f16_f32", conv2d_dw_whcn_f16_f32_len, conv2d_dw_whcn_f16_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1);
|
|
|
ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_cwhn_f16_f32, "conv2d_dw_cwhn_f16_f32", conv2d_dw_cwhn_f16_f32_len, conv2d_dw_cwhn_f16_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1);
|
|
|
|
|
|
- for (uint32_t i = 0; i < num_topk_moe_pipelines; ++i) {
|
|
|
- ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_EARLY_SOFTMAX], "topk_moe_f32_early_softmax_"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 0, 0}, 1, true, true, device->subgroup_size);
|
|
|
- ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_EARLY_SOFTMAX_NORM], "topk_moe_f32_early_softmax_norm"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 1, 0}, 1, true, true, device->subgroup_size);
|
|
|
- ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_LATE_SOFTMAX], "topk_moe_f32_late_softmax"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 0, 1}, 1, true, true, device->subgroup_size);
|
|
|
+ for (uint32_t use_push = 0; use_push < 2; ++use_push) {
|
|
|
+ for (uint32_t i = 0; i < num_topk_moe_pipelines; ++i) {
|
|
|
+ ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_EARLY_SOFTMAX][use_push], "topk_moe_f32_early_softmax_"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 0, 0, use_push}, 1, true, true, device->subgroup_size);
|
|
|
+ ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_EARLY_SOFTMAX_NORM][use_push], "topk_moe_f32_early_softmax_norm"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 1, 0, use_push}, 1, true, true, device->subgroup_size);
|
|
|
+ ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_LATE_SOFTMAX][use_push], "topk_moe_f32_late_softmax"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 0, 1, use_push}, 1, true, true, device->subgroup_size);
|
|
|
+ }
|
|
|
}
|
|
|
|
|
|
for (auto &c : compiles) {
|
|
|
@@ -8554,7 +8558,9 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
|
|
uint32_t idx = (uint32_t)ceilf(log2f(float(dst->ne[0])));
|
|
|
GGML_ASSERT(idx < num_topk_moe_pipelines);
|
|
|
topk_moe_mode mode = ggml_vk_num_additional_ops_to_topk_moe_mode(ctx->num_additional_fused_ops);
|
|
|
- return ctx->device->pipeline_topk_moe[idx][mode];
|
|
|
+ // read n_experts from the push constant when it is not a power of two, i.e. it differs from the spec constant value (1 << idx)
|
|
|
+ bool use_push = dst->ne[0] != (1u << idx);
|
|
|
+ return ctx->device->pipeline_topk_moe[idx][mode][use_push];
|
|
|
}
|
|
|
|
|
|
if (src0->type == GGML_TYPE_F32 && (src1 == nullptr || src1->type == GGML_TYPE_F32) && dst->type == GGML_TYPE_F32) {
|
|
|
@@ -10158,6 +10164,7 @@ static void ggml_vk_topk_moe(ggml_backend_vk_context * ctx, vk_context& subctx,
|
|
|
|
|
|
vk_op_topk_moe_push_constants pc {};
|
|
|
pc.n_rows = n_rows;
|
|
|
+ pc.n_experts_push = n_experts;
|
|
|
pc.n_expert_used = n_expert_used;
|
|
|
if (mode == TOPK_MOE_EARLY_SOFTMAX_NORM) {
|
|
|
ggml_tensor * clamp = cgraph->nodes[node_idx + 7];
|
|
|
@@ -12832,8 +12839,7 @@ static bool ggml_vk_can_fuse_topk_moe(ggml_backend_vk_context * ctx, const struc
|
|
|
}
|
|
|
|
|
|
const int n_expert = softmax->ne[0];
|
|
|
- // n_expert must be a power of 2
|
|
|
- if (!is_pow2(n_expert) || n_expert > (1 << (num_topk_moe_pipelines-1))) {
|
|
|
+ if (n_expert > (1 << (num_topk_moe_pipelines-1))) {
|
|
|
return false;
|
|
|
}
|
|
|
|