@@ -765,6 +765,9 @@ struct vk_device_struct {
     vk_pipeline pipeline_topk_f32[num_topk_pipelines];
     vk_pipeline pipeline_sum_rows_f32;
     vk_pipeline pipeline_cumsum_f32;
+    vk_pipeline pipeline_cumsum_small_f32;
+    vk_pipeline pipeline_cumsum_multipass1_f32;
+    vk_pipeline pipeline_cumsum_multipass2_f32;
     vk_pipeline pipeline_argmax_f32;
     vk_pipeline pipeline_count_equal_i32;
     std::map<vk_solve_tri_pipeline_state, vk_pipeline> pipeline_solve_tri_f32;
@@ -4178,7 +4181,11 @@ static void ggml_vk_load_shaders(vk_device& device) {
     ggml_vk_create_pipeline(device, device->pipeline_sum_rows_f32, "sum_rows_f32", sum_rows_f32_len, sum_rows_f32_data, "main", 2, sizeof(vk_op_sum_rows_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);

-    ggml_vk_create_pipeline(device, device->pipeline_cumsum_f32, "cumsum_f32", cumsum_f32_len, cumsum_f32_data, "main", 2, sizeof(vk_op_sum_rows_push_constants), {1, 1, 1}, { 128, device->subgroup_size }, 1, true, true, device->subgroup_size);
+    const uint32_t cumsum_elem_per_thread = (device->vendor_id == VK_VENDOR_ID_AMD || device->vendor_id == VK_VENDOR_ID_INTEL) ? 2 : 4;
+    ggml_vk_create_pipeline(device, device->pipeline_cumsum_f32, "cumsum_f32", cumsum_f32_len, cumsum_f32_data, "main", 2, sizeof(vk_op_sum_rows_push_constants), {1, 1, 1}, { 256, device->subgroup_size, cumsum_elem_per_thread }, 1, true, true, device->subgroup_size);
+    ggml_vk_create_pipeline(device, device->pipeline_cumsum_small_f32, "cumsum_f32", cumsum_f32_len, cumsum_f32_data, "main", 2, sizeof(vk_op_sum_rows_push_constants), {1, 1, 1}, { 128, device->subgroup_size, 1 }, 1, true, true, device->subgroup_size);
+    ggml_vk_create_pipeline(device, device->pipeline_cumsum_multipass1_f32, "cumsum_multipass1_f32", cumsum_multipass1_f32_len, cumsum_multipass1_f32_data, "main", 3, sizeof(vk_op_sum_rows_push_constants), {256, 1, 1}, { 256, device->subgroup_size }, 1, true, true, device->subgroup_size);
+    ggml_vk_create_pipeline(device, device->pipeline_cumsum_multipass2_f32, "cumsum_multipass2_f32", cumsum_multipass2_f32_len, cumsum_multipass2_f32_data, "main", 3, sizeof(vk_op_sum_rows_push_constants), {256, 1, 1}, { 256, device->subgroup_size }, 1, true, true, device->subgroup_size);

     ggml_vk_create_pipeline(device, device->pipeline_count_equal_i32, "count_equal_i32", count_equal_i32_len, count_equal_i32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, { device->subgroup_size }, 1);
@@ -8804,7 +8811,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
         return nullptr;
     case GGML_OP_CUMSUM:
         if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-            return ctx->device->pipeline_cumsum_f32;
+            if (src0->ne[0] <= 512) {
+                return ctx->device->pipeline_cumsum_small_f32;
+            } else {
+                return ctx->device->pipeline_cumsum_f32;
+            }
         }
         return nullptr;
     case GGML_OP_SOLVE_TRI:
@@ -10708,8 +10719,50 @@ static void ggml_vk_mean(ggml_backend_vk_context * ctx, vk_context& subctx, cons
 }

 static void ggml_vk_cumsum(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
-    vk_op_sum_rows_push_constants p = vk_op_sum_rows_push_constants_init(src0, dst, src0->ne[0]);
-    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_CUMSUM, p);
+    vk_op_sum_rows_push_constants pc = vk_op_sum_rows_push_constants_init(src0, dst, src0->ne[0]);
+    // Use the single pass shader when the rows are small or there are enough rows to fill the GPU.
+    // For fewer, larger rows, use the multipass shader to spread each row across SMs.
+    if (dst->ne[0] <= 4096 || ggml_nrows(dst) >= ctx->device->shader_core_count) {
+        ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_CUMSUM, pc);
+        return;
+    }
+
+    // First pass computes partial sums within a block, and stores the last partial
+    // to the temp buffer. Second pass sums the block partials from the temp buffer
+    // and adds that to the result of the first pass.
+    vk_pipeline pipeline1 = ctx->device->pipeline_cumsum_multipass1_f32;
+    vk_pipeline pipeline2 = ctx->device->pipeline_cumsum_multipass2_f32;
+    GGML_ASSERT(pipeline1 != nullptr && pipeline2 != nullptr);
+
+    ggml_pipeline_request_descriptor_sets(ctx, pipeline1, 1);
+    ggml_pipeline_request_descriptor_sets(ctx, pipeline2, 1);
+
+    std::array<uint32_t, 3> elements;
+
+    elements[0] = dst->ne[0];
+    elements[1] = (uint32_t)ggml_nrows(dst);
+    elements[2] = 1;
+
+    size_t temp_size = sizeof(float) * elements[0] * ggml_nrows(dst);
+
+    if (ctx->prealloc_size_split_k < temp_size) {
+        ctx->prealloc_size_split_k = temp_size;
+        ggml_vk_preallocate_buffers(ctx, subctx);
+    }
+
+    vk_subbuffer src_buf = ggml_vk_tensor_subbuffer(ctx, src0);
+    vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, dst);
+    vk_subbuffer temp_buf = ggml_vk_subbuffer(ctx, ctx->prealloc_split_k, 0);
+
+    if (ctx->prealloc_split_k_need_sync) {
+        ggml_vk_sync_buffers(ctx, subctx);
+    }
+
+    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline1, {src_buf, dst_buf, temp_buf}, pc, elements);
+    ggml_vk_sync_buffers(ctx, subctx);
+    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline2, {src_buf, dst_buf, temp_buf}, pc, elements);
+
+    ctx->prealloc_split_k_need_sync = true;
 }

 static void ggml_vk_argmax(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
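
The comments added in `ggml_vk_cumsum` describe a standard two-pass block scan: pass 1 does an inclusive prefix sum within each block and stores the block's total to the temp buffer, and pass 2 adds the prefix of earlier block totals to each block's result. The sketch below is a CPU-side model of that arithmetic for a single row, not the actual GLSL shaders from this patch; the block size of 256 and the helper names `block_scan_pass1` / `block_scan_pass2` are assumptions made for illustration only.

```cpp
#include <algorithm>
#include <cstddef>

// Assumed block size for the illustration (matches the 256-wide workgroups above).
static constexpr size_t BLOCK = 256;

// Pass 1 (model): inclusive prefix sum inside each block; the last partial of
// each block is written to block_totals.
static void block_scan_pass1(const float * src, float * dst, float * block_totals, size_t n) {
    for (size_t b = 0; b * BLOCK < n; ++b) {
        float running = 0.0f;
        const size_t end = std::min(n, (b + 1) * BLOCK);
        for (size_t i = b * BLOCK; i < end; ++i) {
            running += src[i];
            dst[i] = running;          // partial sums within the block
        }
        block_totals[b] = running;     // block total for pass 2
    }
}

// Pass 2 (model): add the sum of all preceding block totals to every element,
// turning the per-block scans into a full-row cumulative sum.
static void block_scan_pass2(float * dst, const float * block_totals, size_t n) {
    float carry = 0.0f;
    for (size_t b = 0; b * BLOCK < n; ++b) {
        const size_t end = std::min(n, (b + 1) * BLOCK);
        for (size_t i = b * BLOCK; i < end; ++i) {
            dst[i] += carry;           // prefix of earlier block totals
        }
        carry += block_totals[b];
    }
}
```

The serial loops only model the arithmetic; on the GPU each block is a workgroup, so the two passes run in parallel and are separated by the buffer sync between the two dispatches shown in the diff.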