1 month ago · 9bf20d8ac3
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -1565,7 +1565,7 @@ class vk_perf_logger {
 
				                 total_op_times += time;
			
 
				             }
			
 
				             std::cerr << t.first << ": " << t.second.size() << " x " << (total_op_times / t.second.size() / 1000.0)
			
 
				-                      << " us";
			
 
				+                      << " us = " << (total_op_times / 1000.0) << " us";
			
 
				 
			
 
				             // If we have as many flops entries as timing entries for the op, then compute and log the flops/S.
			
 
				             auto it = flops.find(t.first);
			
@@ -2830,9 +2830,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
 
				         s_mmq_wg_denoms_k = { 32,  64,  1 };
			
 
				 
			
 
				         // spec constants and tile sizes for quant matmul_id
			
 
				-        l_warptile_mmqid = { 256, 128, 128, 16, 1, device->subgroup_size };
			
 
				-        m_warptile_mmqid = { 256, 128, 64, 16, 0, device->subgroup_size };
			
 
				-        s_warptile_mmqid = { 256, 128, 64, 16, 0, device->subgroup_size };
			
 
				+        l_warptile_mmqid = { 256, 128, 128, 32, 1, device->subgroup_size };
			
 
				+        m_warptile_mmqid = { 256, 128, 64, 32, 0, device->subgroup_size };
			
 
				+        s_warptile_mmqid = { 256, 128, 64, 32, 0, device->subgroup_size };
			
 
				         l_mmqid_wg_denoms = { 128, 128, 1 };
			
 
				         m_mmqid_wg_denoms = { 128, 64, 1 };
			
 
				         s_mmqid_wg_denoms = { 128, 64, 1 };