6 months ago · b3ad3a0191
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -437,6 +437,7 @@ struct vk_device_struct {
 
				     vk_pipeline pipeline_contig_cpy_f32_f32, pipeline_contig_cpy_f32_f16, pipeline_contig_cpy_f16_f16, pipeline_contig_cpy_f16_f32, pipeline_contig_cpy_f32_bf16;
			
 
				     vk_pipeline pipeline_cpy_f32_quant[GGML_TYPE_COUNT];
			
 
				     vk_pipeline pipeline_cpy_quant_f32[GGML_TYPE_COUNT];
			
 
				+    vk_pipeline pipeline_set_rows[GGML_TYPE_COUNT];
			
 
				     vk_pipeline pipeline_norm_f32;
			
 
				     vk_pipeline pipeline_group_norm_f32;
			
 
				     vk_pipeline pipeline_rms_norm_f32;
			
@@ -2749,19 +2750,41 @@ static void ggml_vk_load_shaders(vk_device& device) {
 
				     ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_bf16,"contig_cpy_f32_bf16",contig_cpy_f32_bf16_len,contig_cpy_f32_bf16_data,"main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
			
 
				 
			
 
				     if (device->float_controls_rte_fp16) {
			
 
				-        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_0], "cpy_f32_q4_0", cpy_f32_q4_0_rte_len, cpy_f32_q4_0_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_0), 1, 1}, {}, 1);
			
 
				-        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_1], "cpy_f32_q4_1", cpy_f32_q4_1_rte_len, cpy_f32_q4_1_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_1), 1, 1}, {}, 1);
			
 
				-        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_0], "cpy_f32_q5_0", cpy_f32_q5_0_rte_len, cpy_f32_q5_0_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_0), 1, 1}, {}, 1);
			
 
				-        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_1], "cpy_f32_q5_1", cpy_f32_q5_1_rte_len, cpy_f32_q5_1_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_1), 1, 1}, {}, 1);
			
 
				-        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q8_0], "cpy_f32_q8_0", cpy_f32_q8_0_rte_len, cpy_f32_q8_0_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q8_0), 1, 1}, {}, 1);
			
 
				-        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_IQ4_NL], "cpy_f32_iq4_nl", cpy_f32_iq4_nl_rte_len, cpy_f32_iq4_nl_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_IQ4_NL), 1, 1}, {}, 1);
			
 
				+        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_0], "cpy_f32_q4_0", cpy_f32_q4_0_rte_len, cpy_f32_q4_0_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
			
 
				+        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_1], "cpy_f32_q4_1", cpy_f32_q4_1_rte_len, cpy_f32_q4_1_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
			
 
				+        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_0], "cpy_f32_q5_0", cpy_f32_q5_0_rte_len, cpy_f32_q5_0_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
			
 
				+        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_1], "cpy_f32_q5_1", cpy_f32_q5_1_rte_len, cpy_f32_q5_1_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
			
 
				+        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q8_0], "cpy_f32_q8_0", cpy_f32_q8_0_rte_len, cpy_f32_q8_0_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
			
 
				+        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_IQ4_NL], "cpy_f32_iq4_nl", cpy_f32_iq4_nl_rte_len, cpy_f32_iq4_nl_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
			
 
				     } else {
			
 
				-        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_0], "cpy_f32_q4_0", cpy_f32_q4_0_len, cpy_f32_q4_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_0), 1, 1}, {}, 1);
			
 
				-        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_1], "cpy_f32_q4_1", cpy_f32_q4_1_len, cpy_f32_q4_1_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_1), 1, 1}, {}, 1);
			
 
				-        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_0], "cpy_f32_q5_0", cpy_f32_q5_0_len, cpy_f32_q5_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_0), 1, 1}, {}, 1);
			
 
				-        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_1], "cpy_f32_q5_1", cpy_f32_q5_1_len, cpy_f32_q5_1_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_1), 1, 1}, {}, 1);
			
 
				-        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q8_0], "cpy_f32_q8_0", cpy_f32_q8_0_len, cpy_f32_q8_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q8_0), 1, 1}, {}, 1);
			
 
				-        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_IQ4_NL], "cpy_f32_iq4_nl", cpy_f32_iq4_nl_len, cpy_f32_iq4_nl_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_IQ4_NL), 1, 1}, {}, 1);
			
 
				+        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_0], "cpy_f32_q4_0", cpy_f32_q4_0_len, cpy_f32_q4_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
			
 
				+        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_1], "cpy_f32_q4_1", cpy_f32_q4_1_len, cpy_f32_q4_1_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
			
 
				+        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_0], "cpy_f32_q5_0", cpy_f32_q5_0_len, cpy_f32_q5_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
			
 
				+        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_1], "cpy_f32_q5_1", cpy_f32_q5_1_len, cpy_f32_q5_1_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
			
 
				+        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q8_0], "cpy_f32_q8_0", cpy_f32_q8_0_len, cpy_f32_q8_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
			
 
				+        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_IQ4_NL], "cpy_f32_iq4_nl", cpy_f32_iq4_nl_len, cpy_f32_iq4_nl_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
			
 
				+    }
			
 
				+
			
 
				+    if (device->float_controls_rte_fp16) {
			
 
				+        ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_F32],  "set_rows_f32",  set_rows_f32_rte_len,  set_rows_f32_rte_data,  "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true);
			
 
				+        ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_F16],  "set_rows_f16",  set_rows_f16_rte_len,  set_rows_f16_rte_data,  "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true);
			
 
				+        ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_BF16], "set_rows_bf16", set_rows_bf16_rte_len, set_rows_bf16_rte_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true);
			
 
				+        ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_Q4_0], "set_rows_q4_0", set_rows_q4_0_rte_len, set_rows_q4_0_rte_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true);
			
 
				+        ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_Q4_1], "set_rows_q4_1", set_rows_q4_1_rte_len, set_rows_q4_1_rte_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true);
			
 
				+        ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_Q5_0], "set_rows_q5_0", set_rows_q5_0_rte_len, set_rows_q5_0_rte_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true);
			
 
				+        ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_Q5_1], "set_rows_q5_1", set_rows_q5_1_rte_len, set_rows_q5_1_rte_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true);
			
 
				+        ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_Q8_0], "set_rows_q8_0", set_rows_q8_0_rte_len, set_rows_q8_0_rte_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true);
			
 
				+        ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_IQ4_NL], "set_rows_iq4_nl", set_rows_iq4_nl_rte_len, set_rows_iq4_nl_rte_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true);
			
 
				+    } else {
			
 
				+        ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_F32],  "set_rows_f32",  set_rows_f32_len,  set_rows_f32_data,  "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true);
			
 
				+        ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_F16],  "set_rows_f16",  set_rows_f16_len,  set_rows_f16_data,  "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true);
			
 
				+        ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_BF16], "set_rows_bf16", set_rows_bf16_len, set_rows_bf16_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true);
			
 
				+        ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_Q4_0], "set_rows_q4_0", set_rows_q4_0_len, set_rows_q4_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true);
			
 
				+        ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_Q4_1], "set_rows_q4_1", set_rows_q4_1_len, set_rows_q4_1_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true);
			
 
				+        ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_Q5_0], "set_rows_q5_0", set_rows_q5_0_len, set_rows_q5_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true);
			
 
				+        ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_Q5_1], "set_rows_q5_1", set_rows_q5_1_len, set_rows_q5_1_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true);
			
 
				+        ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_Q8_0], "set_rows_q8_0", set_rows_q8_0_len, set_rows_q8_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true);
			
 
				+        ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_IQ4_NL], "set_rows_iq4_nl", set_rows_iq4_nl_len, set_rows_iq4_nl_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true);
			
 
				     }
			
 
				 
			
 
				     ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q4_0], "cpy_q4_0_f32", cpy_q4_0_f32_len, cpy_q4_0_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_0), 1, 1}, {}, 1);
			
@@ -6527,6 +6550,8 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
 
				     case GGML_OP_CONT:
			
 
				     case GGML_OP_DUP:
			
 
				         return ggml_vk_get_cpy_pipeline(ctx, src0, dst, dst->type);
			
 
				+    case GGML_OP_SET_ROWS:
			
 
				+        return ctx->device->pipeline_set_rows[dst->type];
			
 
				     case GGML_OP_SILU_BACK:
			
 
				         if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
			
 
				             return ctx->device->pipeline_silu_back_f32;
			
@@ -6765,6 +6790,7 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
 
				     case GGML_OP_RMS_NORM:
			
 
				     case GGML_OP_CONV_2D_DW:
			
 
				     case GGML_OP_IM2COL:
			
 
				+    case GGML_OP_SET_ROWS:
			
 
				         return true;
			
 
				     default:
			
 
				         return false;
			
@@ -7078,6 +7104,12 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
 
				                     ne *= ggml_type_size(src0->type) / 2;
			
 
				                 }
			
 
				             }
			
 
				+            // copy_to_quant has block size of 32, and each thread does QUANT_K elements.
			
 
				+            // Splitting into 512x512xZ wouldn't work well since each workgroup does 1024 elements.
			
 
				+            // So divide by block size here before splitting into 512x512 groups.
			
 
				+            if (op == GGML_OP_CPY && !ggml_is_quantized(src0->type) && ggml_is_quantized(dst->type)) {
			
 
				+                ne = CEIL_DIV(ne, ggml_blck_size(dst->type));
			
 
				+            }
			
 
				             if (ne > 262144) {
			
 
				                 elements = { 512, 512, CEIL_DIV(ne, 262144) };
			
 
				             } else if (ne > 512) {
			
@@ -7086,6 +7118,25 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
 
				                 elements = { ne, 1, 1 };
			
 
				             }
			
 
				         } break;
			
 
				+    case GGML_OP_SET_ROWS:
			
 
				+        {
			
 
				+            uint32_t ne = ggml_nelements(src0);
			
 
				+            if (ggml_is_quantized(dst->type)) {
			
 
				+                // quants run 32 threads each doing QUANT_K elements
			
 
				+                ne = CEIL_DIV(ne, 32 * ggml_blck_size(dst->type));
			
 
				+            } else {
			
 
				+                // scalar types do one element per thread, running 512 threads
			
 
				+                ne = CEIL_DIV(ne, 512);
			
 
				+            }
			
 
				+            if (ne > 262144) {
			
 
				+                elements = { 512, 512, CEIL_DIV(ne, 262144) };
			
 
				+            } else if (ne > 512) {
			
 
				+                elements = { 512, CEIL_DIV(ne, 512), 1 };
			
 
				+            } else {
			
 
				+                elements = { ne, 1, 1 };
			
 
				+            }
			
 
				+        }
			
 
				+        break;
			
 
				     default:
			
 
				         elements = { (uint32_t)ggml_nelements(src0), 1, 1 };
			
 
				         break;
			
@@ -7648,6 +7699,21 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const
 
				     }, dryrun);
			
 
				 }
			
 
				 
			
 
				+static void ggml_vk_set_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { // Forwards a GGML_OP_SET_ROWS node to ggml_vk_op_f32 with binary push constants describing src0, src1 and dst.
			
 
				+    const uint32_t src0_type_size = ggml_type_size(src0->type); // divisor applied to every src0->nb[] stride below
			
 
				+    const uint32_t src1_type_size = ggml_type_size(src1->type); // divisor applied to every src1->nb[] stride below
			
 
				+    const uint32_t dst_type_size = ggml_type_size(dst->type); // divisor applied to every dst->nb[] stride below
			
 
				+    // Each nb[i] is divided by the owning tensor's type size before being passed on; presumably this turns byte strides into element/block strides - TODO confirm.
			
 
				+    ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_SET_ROWS, {
			
 
				+        (uint32_t)ggml_nelements(src0), // total element count of src0, truncated to 32 bits
			
 
				+        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, // src0: four extents, then four scaled strides
			
 
				+        (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, // src1: four extents, then four scaled strides
			
 
				+        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] /  dst_type_size, (uint32_t) dst->nb[1] /  dst_type_size, (uint32_t) dst->nb[2] /  dst_type_size, (uint32_t) dst->nb[3] /  dst_type_size, // dst: four extents, then four scaled strides
			
 
				+        0, // NOTE(review): field meaning is not visible here - confirm against vk_op_binary_push_constants
			
 
				+        0.0f, 0.0f, 0, // trailing push-constant values are passed as zero
			
 
				+    }, dryrun); // dryrun is handed through unchanged
			
 
				+}
			
 
				+
			
 
				 static void ggml_vk_silu_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { // Forwards a GGML_OP_SILU_BACK node to ggml_vk_op_f32 using the plain vk_op_push_constants layout.
			
 
				     ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_SILU_BACK, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f }, dryrun); // push constants: src0 element count followed by three zeroed values
			
 
				 }
			
@@ -8968,6 +9034,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
 
				     case GGML_OP_CLAMP:
			
 
				     case GGML_OP_PAD:
			
 
				     case GGML_OP_CPY:
			
 
				+    case GGML_OP_SET_ROWS:
			
 
				     case GGML_OP_CONT:
			
 
				     case GGML_OP_DUP:
			
 
				     case GGML_OP_SILU_BACK:
			
@@ -9034,6 +9101,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
 
				         case GGML_OP_CLAMP:
			
 
				         case GGML_OP_PAD:
			
 
				         case GGML_OP_CPY:
			
 
				+        case GGML_OP_SET_ROWS:
			
 
				         case GGML_OP_CONT:
			
 
				         case GGML_OP_DUP:
			
 
				         case GGML_OP_SILU_BACK:
			
@@ -9142,6 +9210,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
 
				     case GGML_OP_DUP:
			
 
				         ggml_vk_cpy(ctx, compute_ctx, src0, node, dryrun);
			
 
				 
			
 
				+        break;
			
 
				+    case GGML_OP_SET_ROWS:
			
 
				+        ggml_vk_set_rows(ctx, compute_ctx, src0, src1, node, dryrun);
			
 
				+
			
 
				         break;
			
 
				     case GGML_OP_SILU_BACK:
			
 
				         ggml_vk_silu_back(ctx, compute_ctx, src0, src1, node, dryrun);
			
@@ -9357,6 +9429,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_cgraph *
 
				     case GGML_OP_CLAMP:
			
 
				     case GGML_OP_PAD:
			
 
				     case GGML_OP_CPY:
			
 
				+    case GGML_OP_SET_ROWS:
			
 
				     case GGML_OP_CONT:
			
 
				     case GGML_OP_DUP:
			
 
				     case GGML_OP_SILU_BACK:
			
@@ -10422,9 +10495,20 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
 
				             } break;
			
 
				         case GGML_OP_SET_ROWS:
			
 
				             {
			
 
				-                // TODO: add support
			
 
				-                // ref: https://github.com/ggml-org/llama.cpp/pull/14274
			
 
				-                return false;
			
 
				+                switch (op->type) {
			
 
				+                    case GGML_TYPE_F32:
			
 
				+                    case GGML_TYPE_F16:
			
 
				+                    case GGML_TYPE_BF16:
			
 
				+                    case GGML_TYPE_Q4_0:
			
 
				+                    case GGML_TYPE_Q4_1:
			
 
				+                    case GGML_TYPE_Q5_0:
			
 
				+                    case GGML_TYPE_Q5_1:
			
 
				+                    case GGML_TYPE_Q8_0:
			
 
				+                    case GGML_TYPE_IQ4_NL:
			
 
				+                        return true;
			
 
				+                    default:
			
 
				+                        return false;
			
 
				+                }
			
 
				             } break;
			
 
				         case GGML_OP_CONT:
			
 
				         case GGML_OP_CPY:
			
@@ -11039,6 +11123,8 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *
 
				         } else {
			
 
				             tensor_clone = ggml_cpy(ggml_ctx, src_clone[0], src_clone[1]);
			
 
				         }
			
 
				+    } else if (tensor->op == GGML_OP_SET_ROWS) {
			
 
				+        tensor_clone = ggml_set_rows(ggml_ctx, src_clone[0], src_clone[1]);
			
 
				     } else if (tensor->op == GGML_OP_CONT) {
			
 
				         tensor_clone = ggml_cont_4d(ggml_ctx, src_clone[0], tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
			
 
				     } else if (tensor->op == GGML_OP_RESHAPE) {
			
--- a/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp
@@ -6,17 +6,25 @@ spirv_execution_mode(capabilities = [4467], 4462, 16); // RoundingModeRTE, 16 bi
 
				 #endif // RTE16
			
 
				 
			
 
				 #include "types.comp"
			
 
				-#include "generic_unary_head.comp"
			
 
				 
			
 
				-#if defined(DATA_A_IQ4_NL)
			
 
				-// 16 invocations needed for init_iq4nl_shmem
			
 
				-layout(local_size_x = 16, local_size_y = 1, local_size_z = 1) in;
			
 
				+#if defined(SET_ROWS) && QUANT_K == 1
			
 
				+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
			
 
				+const uint BLOCK_SIZE = 512;
			
 
				 #else
			
 
				-layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;
			
 
				+layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
			
 
				+const uint BLOCK_SIZE = 32;
			
 
				 #endif
			
 
				 
			
 
				 layout (binding = 0) readonly buffer S {float data_s[];};
			
 
				+
			
 
				+#if defined(SET_ROWS)
			
 
				+#include "generic_binary_head.comp"
			
 
				+layout (binding = 1) readonly buffer C {uvec2 data_i[];};
			
 
				+layout (binding = 2) writeonly buffer Q {A_TYPE data_q[];};
			
 
				+#else
			
 
				+#include "generic_unary_head.comp"
			
 
				 layout (binding = 1) writeonly buffer Q {A_TYPE data_q[];};
			
 
				+#endif
			
 
				 
			
 
				 #if defined(DATA_A_Q4_0)
			
 
				 void quantize(uint dst_idx, uint src_idx)
			
@@ -221,15 +229,56 @@ void quantize(uint dst_idx, uint src_idx)
 
				 }
			
 
				 #endif
			
 
				 
			
 
				+#if defined(DATA_A_F32) || defined(DATA_A_F16)
			
 
				+void quantize(uint dst_idx, uint src_idx) // Variant compiled when DATA_A_F32 or DATA_A_F16 is defined: a plain type conversion, no block packing.
			
 
				+{
			
 
				+    data_q[dst_idx] = A_TYPE(data_s[src_idx]); // read one float from data_s, convert with the A_TYPE constructor, store at dst_idx
			
 
				+}
			
 
				+#endif
			
 
				+
			
 
				+#if defined(DATA_A_BF16)
			
 
				+void quantize(uint dst_idx, uint src_idx) // Variant compiled when DATA_A_BF16 is defined.
			
 
				+{
			
 
				+    data_q[dst_idx] = A_TYPE(fp32_to_bf16(data_s[src_idx])); // float goes through fp32_to_bf16 (defined outside this view) before the A_TYPE constructor
			
 
				+}
			
 
				+#endif
			
 
				+
			
 
				+#if defined(SET_ROWS)
			
 
				+
			
 
				 void main() {
			
 
				 #ifdef NEEDS_INIT_IQ_SHMEM
			
 
				     init_iq_shmem(gl_WorkGroupSize);
			
 
				-    if (gl_LocalInvocationIndex.x != 0) {
			
 
				+#endif
			
 
				+    // SET_ROWS entry point. idx = (linear workgroup id (z*262144 + y*512 + x) * BLOCK_SIZE + local invocation) * QUANT_K.
			
 
				+    const uint idx = ((gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x) * BLOCK_SIZE + gl_LocalInvocationID.x) * QUANT_K;
			
 
				+
			
 
				+    if (idx >= p.ne) { // invocations whose starting index is at or past p.ne do nothing
			
 
				         return;
			
 
				     }
			
 
				+    // Split idx into four src0 coordinates (get_indices is defined outside this view).
			
 
				+    uint i00, i01, i02, i03;
			
 
				+    get_indices(idx, i00, i01, i02, i03);
			
 
				+    // Coordinates into the index buffer: i12/i11 come from i03/i02 via fastmod with p.ne12/p.ne11 (presumably a modulo helper - confirm); i10 reuses i01.
			
 
				+    uint i12 = fastmod(i03, p.ne12);
			
 
				+    uint i11 = fastmod(i02, p.ne11);
			
 
				+    uint i10 = i01;
			
 
				+    // Destination row i1 is the .x component of the selected data_i entry.
			
 
				+    uint i1 = data_i[src1_idx(i10, i11, i12, 0) + get_boffset()].x;
			
 
				+    // Source keeps the src0 coordinates; destination uses i00 / QUANT_K in dim 0 and row i1 in place of i01.
			
 
				+    uint src0_idx = src0_idx(i00, i01, i02, i03) + get_aoffset();
			
 
				+    uint dst_idx = dst_idx(i00 / QUANT_K, i1, i02, i03) + get_doffset();
			
 
				+
			
 
				+    quantize(dst_idx, src0_idx); // hand both offsets to the type-specific quantize() selected by the DATA_A_* define
			
 
				+}
			
 
				+
			
 
				+#else
			
 
				+
			
 
				+void main() {
			
 
				+#ifdef NEEDS_INIT_IQ_SHMEM
			
 
				+    init_iq_shmem(gl_WorkGroupSize);
			
 
				 #endif
			
 
				 
			
 
				-    const uint idx = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x * QUANT_K;
			
 
				+    const uint idx = (gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x * 32 + gl_LocalInvocationID.x) * QUANT_K;
			
 
				 
			
 
				     if (idx >= p.ne) {
			
 
				         return;
			
@@ -240,3 +289,5 @@ void main() {
 
				 
			
 
				     quantize(dst_idx, src_idx);
			
 
				 }
			
 
				+
			
 
				+#endif
			
--- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
@@ -518,6 +518,11 @@ void process_shaders() {
 
				         string_to_spv("cpy_" + t + "_f32", "copy_from_quant.comp", {{"DATA_A_" + to_uppercase(t), "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
			
 
				     }
			
 
				 
			
 
				+    for (std::string t : {"f32", "f16", "bf16", "q4_0", "q4_1", "q5_0", "q5_1", "q8_0", "iq4_nl"}) {
			
 
				+        string_to_spv("set_rows_" + t, "copy_to_quant.comp", {{"SET_ROWS", "1"}, {"DATA_A_" + to_uppercase(t), "1"}, {"B_TYPE", "uvec2"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
			
 
				+        string_to_spv("set_rows_" + t + "_rte", "copy_to_quant.comp", {{"SET_ROWS", "1"}, {"DATA_A_" + to_uppercase(t), "1"}, {"B_TYPE", "uvec2"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"RTE16", "1"}});
			
 
				+    }
			
 
				+
			
 
				     auto get_type_str = [](bool f16) {
			
 
				         return f16 ? "float16_t" : "float";
			
 
				     };