|
|
@@ -722,6 +722,11 @@ struct vk_device_struct {
|
|
|
vk_pipeline pipeline_soft_max_f32, pipeline_soft_max_f32_f16;
|
|
|
vk_pipeline pipeline_soft_max_f32_wg512, pipeline_soft_max_f32_f16_wg512;
|
|
|
vk_pipeline pipeline_soft_max_back_f32;
|
|
|
+
|
|
|
+ vk_pipeline pipeline_soft_max_large1_f32, pipeline_soft_max_large1_f32_f16;
|
|
|
+ vk_pipeline pipeline_soft_max_large2_f32, pipeline_soft_max_large2_f32_f16;
|
|
|
+ vk_pipeline pipeline_soft_max_large3_f32, pipeline_soft_max_large3_f32_f16;
|
|
|
+
|
|
|
vk_pipeline pipeline_rope_norm_f32, pipeline_rope_norm_f16, pipeline_rope_norm_f32_f16;
|
|
|
vk_pipeline pipeline_rope_neox_f32, pipeline_rope_neox_f16, pipeline_rope_neox_f32_f16;
|
|
|
vk_pipeline pipeline_rope_multi_f32, pipeline_rope_multi_f16;
|
|
|
@@ -3998,6 +4003,13 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|
|
ggml_vk_create_pipeline(device, device->pipeline_soft_max_f32_f16_wg512, "soft_max_f32_f16_wg512", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 4, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 512 }, 1);
|
|
|
ggml_vk_create_pipeline(device, device->pipeline_soft_max_back_f32, "soft_max_back_f32", soft_max_back_f32_len, soft_max_back_f32_data, "main", 3, sizeof(vk_op_push_constants), {1, 1, 1}, { device->subgroup_size }, 1, true);
|
|
|
|
|
|
+ ggml_vk_create_pipeline(device, device->pipeline_soft_max_large1_f32, "soft_max_large1_f32", soft_max_large1_f32_len, soft_max_large1_f32_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true);
|
|
|
+ ggml_vk_create_pipeline(device, device->pipeline_soft_max_large2_f32, "soft_max_large2_f32", soft_max_large2_f32_len, soft_max_large2_f32_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true);
|
|
|
+ ggml_vk_create_pipeline(device, device->pipeline_soft_max_large3_f32, "soft_max_large3_f32", soft_max_large3_f32_len, soft_max_large3_f32_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true);
|
|
|
+ ggml_vk_create_pipeline(device, device->pipeline_soft_max_large1_f32_f16, "soft_max_large1_f32_f16", soft_max_large1_f32_f16_len, soft_max_large1_f32_f16_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true);
|
|
|
+ ggml_vk_create_pipeline(device, device->pipeline_soft_max_large2_f32_f16, "soft_max_large2_f32_f16", soft_max_large2_f32_f16_len, soft_max_large2_f32_f16_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true);
|
|
|
+ ggml_vk_create_pipeline(device, device->pipeline_soft_max_large3_f32_f16, "soft_max_large3_f32_f16", soft_max_large3_f32_f16_len, soft_max_large3_f32_f16_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true);
|
|
|
+
|
|
|
ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f32, "rope_norm_f32", rope_norm_f32_len, rope_norm_f32_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
|
|
ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
|
|
ggml_vk_create_pipeline(device, device->pipeline_rope_multi_f32, "rope_multi_f32", rope_multi_f32_len, rope_multi_f32_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
|
|
@@ -10117,7 +10129,7 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx,
|
|
|
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
|
|
|
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
|
|
|
|
|
|
- ggml_vk_op_f32<vk_op_soft_max_push_constants>(ctx, subctx, src0, src1, src2, nullptr, dst, GGML_OP_SOFT_MAX, {
|
|
|
+ vk_op_soft_max_push_constants pc {
|
|
|
ncols,
|
|
|
src1 != nullptr ? nrows_y : (uint32_t)0,
|
|
|
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],
|
|
|
@@ -10128,7 +10140,55 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx,
|
|
|
n_head_log2,
|
|
|
nrows_x,
|
|
|
src2 != nullptr
|
|
|
- });
|
|
|
+ };
|
|
|
+
|
|
|
+ if (ncols <= 16384) {
|
|
|
+ ggml_vk_op_f32<vk_op_soft_max_push_constants>(ctx, subctx, src0, src1, src2, nullptr, dst, GGML_OP_SOFT_MAX, std::move(pc));
|
|
|
+ } else {
|
|
|
+
|
|
|
+ vk_subbuffer buf_a = ggml_vk_tensor_subbuffer(ctx, src0);
|
|
|
+ vk_subbuffer buf_b = src1 ? ggml_vk_tensor_subbuffer(ctx, src1) : buf_a;
|
|
|
+ vk_subbuffer buf_c = src2 ? ggml_vk_tensor_subbuffer(ctx, src2) : buf_a;
|
|
|
+ vk_subbuffer buf_d = ggml_vk_tensor_subbuffer(ctx, dst);
|
|
|
+
|
|
|
+ uint32_t elems_per_wg = 128 * 4;
|
|
|
+ uint32_t num_wgs = CEIL_DIV(ncols, elems_per_wg);
|
|
|
+ size_t tmp_size = num_wgs * nrows_x * sizeof(float);
|
|
|
+
|
|
|
+ if (ctx->prealloc_size_x < tmp_size) {
|
|
|
+ ctx->prealloc_size_x = tmp_size;
|
|
|
+ ggml_vk_preallocate_buffers(ctx, subctx);
|
|
|
+ }
|
|
|
+ if (ctx->prealloc_size_y < tmp_size) {
|
|
|
+ ctx->prealloc_size_y = tmp_size;
|
|
|
+ ggml_vk_preallocate_buffers(ctx, subctx);
|
|
|
+ }
|
|
|
+ if (ctx->prealloc_x_need_sync || ctx->prealloc_y_need_sync) {
|
|
|
+ ggml_vk_sync_buffers(ctx, subctx);
|
|
|
+ }
|
|
|
+
|
|
|
+ vk_subbuffer buf_x = { ctx->prealloc_x, 0, tmp_size };
|
|
|
+ vk_subbuffer buf_y = { ctx->prealloc_y, 0, tmp_size };
|
|
|
+
|
|
|
+ std::array<uint32_t, 3> elements = { num_wgs, nrows_x, 1 };
|
|
|
+
|
|
|
+ vk_pipeline pipeline1 = src1 && src1->type == GGML_TYPE_F16 ? ctx->device->pipeline_soft_max_large1_f32_f16 : ctx->device->pipeline_soft_max_large1_f32;
|
|
|
+ vk_pipeline pipeline2 = src1 && src1->type == GGML_TYPE_F16 ? ctx->device->pipeline_soft_max_large2_f32_f16 : ctx->device->pipeline_soft_max_large2_f32;
|
|
|
+ vk_pipeline pipeline3 = src1 && src1->type == GGML_TYPE_F16 ? ctx->device->pipeline_soft_max_large3_f32_f16 : ctx->device->pipeline_soft_max_large3_f32;
|
|
|
+
|
|
|
+ ggml_pipeline_request_descriptor_sets(ctx, pipeline1, 1);
|
|
|
+ ggml_pipeline_request_descriptor_sets(ctx, pipeline2, 1);
|
|
|
+ ggml_pipeline_request_descriptor_sets(ctx, pipeline3, 1);
|
|
|
+
|
|
|
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline1, { buf_a, buf_b, buf_c, buf_d, buf_x, buf_y }, pc, elements);
|
|
|
+ ggml_vk_sync_buffers(ctx, subctx);
|
|
|
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline2, { buf_a, buf_b, buf_c, buf_d, buf_x, buf_y }, pc, elements);
|
|
|
+ ggml_vk_sync_buffers(ctx, subctx);
|
|
|
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline3, { buf_a, buf_b, buf_c, buf_d, buf_x, buf_y }, pc, elements);
|
|
|
+
|
|
|
+ ctx->prealloc_x_need_sync = true;
|
|
|
+ ctx->prealloc_y_need_sync = true;
|
|
|
+ }
|
|
|
}
|
|
|
|
|
|
static void ggml_vk_soft_max_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|