Просмотр исходного кода

Vulkan k-quant mmq and ggml-backend offload functionality (#6155)

* Fix Vulkan no kv offload incoherence

* Add k-quant mul mat mat shaders

* Rework working buffer allocation, reduces vram use noticeably

Clean up cpu assist code, replaced with ggml-backend offload function

* Default to all dedicated GPUs

* Add fallback for integrated GPUs if no dedicated GPUs are found

* Add debug info which device is allocating memory

* Fix Intel dequant issue

Fix validation issue

* Fix Vulkan GGML_OP_GET_ROWS implementation

* Clean up merge artifacts

* Remove Vulkan warning
0cc4m 1 год назад
Родитель
Commit
ba0c7c70ab
7 изменённых файлов: 1231 добавление и 704 удаления
  1. 0 9
      README.md
  2. 696 312
      ggml-vulkan-shaders.hpp
  3. 286 286
      ggml-vulkan.cpp
  4. 0 11
      ggml-vulkan.h
  5. 0 35
      ggml.c
  6. 235 46
      ggml_vk_generate_shaders.py
  7. 14 5
      llama.cpp

+ 0 - 9
README.md

@@ -636,15 +636,6 @@ Building the program with BLAS support may lead to some performance improvements
 
 
 - #### Vulkan
 - #### Vulkan
 
 
-> [!WARNING]
->
-> Vulkan support has been broken in https://github.com/ggerganov/llama.cpp/pull/6122
-> due to relying on `GGML_OP_GET_ROWS` which is not yet properly supported by the Vulkan backend,
-> but should be fixed relatively soon (possibly in https://github.com/ggerganov/llama.cpp/pull/6155
-> (ref: https://github.com/ggerganov/llama.cpp/pull/6122#issuecomment-2015327635)).
->
-> Meanwhile, if you want to use the Vulkan backend, you should use the commit right before the breaking change, https://github.com/ggerganov/llama.cpp/commit/55c1b2a3bbd470e9e2a3a0618b92cf64a885f806
-
   **With docker**:
   **With docker**:
 
 
   You don't need to install Vulkan SDK. It will be installed inside the container.
   You don't need to install Vulkan SDK. It will be installed inside the container.

Разница между файлами не показана из-за своего большого размера
+ 696 - 312
ggml-vulkan-shaders.hpp


Разница между файлами не показана из-за своего большого размера
+ 286 - 286
ggml-vulkan.cpp


+ 0 - 11
ggml-vulkan.h

@@ -11,17 +11,6 @@ extern "C" {
 #define GGML_VK_MAX_DEVICES 16
 #define GGML_VK_MAX_DEVICES 16
 
 
 GGML_API void ggml_vk_instance_init(void);
 GGML_API void ggml_vk_instance_init(void);
-GGML_API void ggml_vk_init_cpu_assist(void);
-
-GGML_API void ggml_vk_preallocate_buffers_graph_cpu_assist(struct ggml_tensor * node);
-GGML_API void ggml_vk_preallocate_buffers_cpu_assist(void);
-GGML_API void ggml_vk_build_graph_cpu_assist(struct ggml_tensor * node, bool last_node);
-GGML_API bool ggml_vk_compute_forward_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor);
-#ifdef GGML_VULKAN_CHECK_RESULTS
-void ggml_vk_check_results_1_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor);
-#endif
-GGML_API void ggml_vk_graph_cleanup_cpu_assist(void);
-GGML_API void ggml_vk_free_cpu_assist(void);
 
 
 // backend API
 // backend API
 GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num);
 GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num);

+ 0 - 35
ggml.c

@@ -278,8 +278,6 @@ inline static void * ggml_calloc(size_t num, size_t size) {
 #include <Accelerate/Accelerate.h>
 #include <Accelerate/Accelerate.h>
 #if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
 #if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
 #include "ggml-opencl.h"
 #include "ggml-opencl.h"
-#elif defined(GGML_USE_VULKAN)
-#include "ggml-vulkan.h"
 #endif
 #endif
 #elif defined(GGML_USE_OPENBLAS)
 #elif defined(GGML_USE_OPENBLAS)
 #if defined(GGML_BLAS_USE_MKL)
 #if defined(GGML_BLAS_USE_MKL)
@@ -289,8 +287,6 @@ inline static void * ggml_calloc(size_t num, size_t size) {
 #endif
 #endif
 #elif defined(GGML_USE_CLBLAST)
 #elif defined(GGML_USE_CLBLAST)
 #include "ggml-opencl.h"
 #include "ggml-opencl.h"
-#elif defined(GGML_USE_VULKAN)
-#include "ggml-vulkan.h"
 #endif
 #endif
 
 
 // floating point type used to accumulate sums
 // floating point type used to accumulate sums
@@ -2717,8 +2713,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
 
 
 #if defined(GGML_USE_CLBLAST)
 #if defined(GGML_USE_CLBLAST)
         ggml_cl_init();
         ggml_cl_init();
-#elif defined(GGML_USE_VULKAN)
-        ggml_vk_init_cpu_assist();
 #endif
 #endif
 
 
         ggml_setup_op_has_task_pass();
         ggml_setup_op_has_task_pass();
@@ -16128,20 +16122,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
         return;
         return;
     }
     }
 
 
-#if defined(GGML_USE_VULKAN)
-    const bool skip_cpu = ggml_vk_compute_forward_cpu_assist(params, tensor);
-#ifdef GGML_VULKAN_CHECK_RESULTS
-    if (skip_cpu) {
-        ggml_vk_check_results_1_cpu_assist(params, tensor);
-    }
-#endif
-    if (skip_cpu) {
-        return;
-    }
-    GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU);
-    GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU);
-#endif // GGML_USE_VULKAN
-
     switch (tensor->op) {
     switch (tensor->op) {
         case GGML_OP_DUP:
         case GGML_OP_DUP:
             {
             {
@@ -18617,17 +18597,6 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
         }
         }
     }
     }
 
 
-#ifdef GGML_USE_VULKAN
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_vk_preallocate_buffers_graph_cpu_assist(cgraph->nodes[i]);
-    }
-    ggml_vk_preallocate_buffers_cpu_assist();
-
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_vk_build_graph_cpu_assist(cgraph->nodes[i], i == cgraph->n_nodes - 1);
-    }
-#endif
-
     const int n_threads = cplan->n_threads;
     const int n_threads = cplan->n_threads;
 
 
     struct ggml_compute_state_shared state_shared = {
     struct ggml_compute_state_shared state_shared = {
@@ -18684,10 +18653,6 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
         }
         }
     }
     }
 
 
-#ifdef GGML_USE_VULKAN
-    ggml_vk_graph_cleanup_cpu_assist();
-#endif
-
     // performance stats (graph)
     // performance stats (graph)
     {
     {
         int64_t perf_cycles_cur  = ggml_perf_cycles()  - perf_start_cycles;
         int64_t perf_cycles_cur  = ggml_perf_cycles()  - perf_start_cycles;

+ 235 - 46
ggml_vk_generate_shaders.py

@@ -18,6 +18,12 @@ shader_int8_ext = """
 """
 """
 
 
 # Type-specific defines
 # Type-specific defines
+shader_f32_defines = """
+#define QUANT_K 1
+#define QUANT_R 1
+
+#define A_TYPE float
+"""
 shader_f16_defines = """
 shader_f16_defines = """
 #define QUANT_K 1
 #define QUANT_K 1
 #define QUANT_R 1
 #define QUANT_R 1
@@ -157,8 +163,8 @@ struct block_q6_K
 """
 """
 
 
 # Dequant functions
 # Dequant functions
-shader_f16_dequant_func = """
-#define DEQUANT_FUNC vec2 v = vec2(data_a[ib + 0], data_a[ib + 1]);
+shader_float_dequant_func = """
+#define DEQUANT_FUNC vec2 v = vec2(ib, ib);  // data_a[ib], data_a[ib + 1]);
 """
 """
 
 
 shader_q4_0_dequant_func = """
 shader_q4_0_dequant_func = """
@@ -410,6 +416,133 @@ mulmat_load_q8_0 = """
             buf_a[buf_idx    ] = FLOAT_TYPE(v.x);
             buf_a[buf_idx    ] = FLOAT_TYPE(v.x);
             buf_a[buf_idx + 1] = FLOAT_TYPE(v.y);"""
             buf_a[buf_idx + 1] = FLOAT_TYPE(v.y);"""
 
 
+
+mulmat_load_q2_K = """
+            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
+            const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a * LOAD_VEC_A;
+
+            const uint ib = idx / 128;                         // 2 values per idx
+            const uint iqs = idx % 128;                        // 0..127
+
+            const uint qsi = (iqs / 64) * 32 + (iqs % 16) * 2; // 0,2,4..30
+            const uint scalesi = iqs / 8;                      // 0..15
+            const uint qsshift = ((iqs % 64) / 16) * 2;        // 0,2,4,6
+
+            const uvec2 qs = uvec2(data_a[ib].qs[qsi], data_a[ib].qs[qsi + 1]);
+            const uint scales = data_a[ib].scales[scalesi];
+            const vec2 d = vec2(data_a[ib].d);
+
+            const vec2 v = d.x * float(scales & 0xF) * vec2((qs >> qsshift) & 3) - d.y * float(scales >> 4);
+
+            buf_a[buf_idx    ] = FLOAT_TYPE(v.x);
+            buf_a[buf_idx + 1] = FLOAT_TYPE(v.y);"""
+
+mulmat_load_q3_K = """
+            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
+            const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a * LOAD_VEC_A;
+
+            const uint ib = idx / 128;                   // 2 values per idx
+            const uint iqs = idx % 128;                  // 0..127
+
+            const uint n = iqs / 64;                     // 0,1
+            const uint qsi = n * 32 + (iqs % 16) * 2;    // 0,2,4..62
+            const uint hmi =          (iqs % 16) * 2;    // 0,2,4..30
+            const uint j = (iqs % 64) / 4;               // 0..3
+            const uint is = iqs / 8;                     // 0..15
+            const uint halfsplit = ((iqs % 64) / 16);    // 0,1,2,3
+            const uint qsshift = halfsplit * 2;          // 0,2,4,6
+            const uint m = 1 << (4 * n + halfsplit);     // 1,2,4,8,16,32,64,128
+
+            const int8_t us = int8_t(is <  4 ? (data_a[ib].scales[is-0] & 0xF) | (((data_a[ib].scales[is+8] >> 0) & 3) << 4) :
+                                    is <  8 ? (data_a[ib].scales[is-0] & 0xF) | (((data_a[ib].scales[is+4] >> 2) & 3) << 4) :
+                                    is < 12 ? (data_a[ib].scales[is-8] >>  4) | (((data_a[ib].scales[is+0] >> 4) & 3) << 4) :
+                                            (data_a[ib].scales[is-8] >>  4) | (((data_a[ib].scales[is-4] >> 6) & 3) << 4));
+            const float dl = float(data_a[ib].d) * float(us - 32);
+
+            buf_a[buf_idx    ] = FLOAT_TYPE(dl * float(int8_t((data_a[ib].qs[qsi    ] >> qsshift) & 3) - (((data_a[ib].hmask[hmi    ] & m) != 0) ? 0 : 4)));
+            buf_a[buf_idx + 1] = FLOAT_TYPE(dl * float(int8_t((data_a[ib].qs[qsi + 1] >> qsshift) & 3) - (((data_a[ib].hmask[hmi + 1] & m) != 0) ? 0 : 4)));"""
+
+mulmat_load_q4_K = """
+            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
+            const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a * LOAD_VEC_A;
+
+            const uint ib = idx / 128;                 // 2 values per idx
+            const uint iqs = idx % 128;                // 0..127
+
+            const uint n = iqs / 32;                   // 0,1,2,3
+            const uint b = (iqs % 32) / 16;            // 0,1
+            const uint is = 2 * n + b;                 // 0..7
+            const uint qsi = n * 32 + (iqs % 16) * 2;  // 0,2,4..126
+
+            const vec2 loadd = vec2(data_a[ib].d);
+
+            uint8_t sc;
+            uint8_t mbyte;
+            if (is < 4) {
+                sc    = uint8_t(data_a[ib].scales[is    ] & 63);
+                mbyte = uint8_t(data_a[ib].scales[is + 4] & 63);
+            } else {
+                sc    = uint8_t((data_a[ib].scales[is + 4] & 0xF) | ((data_a[ib].scales[is - 4] >> 6) << 4));
+                mbyte = uint8_t((data_a[ib].scales[is + 4] >>  4) | ((data_a[ib].scales[is    ] >> 6) << 4));
+            }
+            const float d = loadd.x * sc;
+            const float m = loadd.y * mbyte;
+
+            buf_a[buf_idx    ] = FLOAT_TYPE(d * float((data_a[ib].qs[qsi    ] >> (b * 4)) & 0xF) - m);
+            buf_a[buf_idx + 1] = FLOAT_TYPE(d * float((data_a[ib].qs[qsi + 1] >> (b * 4)) & 0xF) - m);"""
+
+mulmat_load_q5_K = """
+            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
+            const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a * LOAD_VEC_A;
+
+            const uint ib = idx / 128;                 // 2 values per idx
+            const uint iqs = idx % 128;                // 0..127
+
+            const uint n = iqs / 32;                   // 0,1,2,3
+            const uint b = (iqs % 32) / 16;            // 0,1
+            const uint is = 2 * n + b;                 // 0..7
+            const uint qsi = n * 32 + (iqs % 16) * 2;  // 0,2,4..126
+            const uint qhi = (iqs % 16) * 2;           // 0,2,4..30
+
+            const uint8_t hm = uint8_t(1 << (iqs / 16));
+
+            const vec2 loadd = vec2(data_a[ib].d);
+
+            uint8_t sc;
+            uint8_t mbyte;
+            if (is < 4) {
+                sc    = uint8_t(data_a[ib].scales[is    ] & 63);
+                mbyte = uint8_t(data_a[ib].scales[is + 4] & 63);
+            } else {
+                sc    = uint8_t((data_a[ib].scales[is + 4] & 0xF) | ((data_a[ib].scales[is - 4] >> 6) << 4));
+                mbyte = uint8_t((data_a[ib].scales[is + 4] >>  4) | ((data_a[ib].scales[is    ] >> 6) << 4));
+            }
+            const float d = loadd.x * sc;
+            const float m = loadd.y * mbyte;
+
+            buf_a[buf_idx    ] = FLOAT_TYPE(d * (float((data_a[ib].qs[qsi    ] >> (b * 4)) & 0xF) + float((data_a[ib].qh[qhi    ] & hm) != 0 ? 16 : 0)) - m);
+            buf_a[buf_idx + 1] = FLOAT_TYPE(d * (float((data_a[ib].qs[qsi + 1] >> (b * 4)) & 0xF) + float((data_a[ib].qh[qhi + 1] & hm) != 0 ? 16 : 0)) - m);"""
+
+mulmat_load_q6_K = """
+            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
+            const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a * LOAD_VEC_A;
+
+            const uint ib = idx / 128;                  // 2 values per idx
+            const uint iqs = idx % 128;                 // 0..127
+
+            const uint n = iqs / 64;                    // 0,1
+            const uint b = (iqs % 64) / 32;             // 0,1
+            const uint is_b = (iqs % 16) / 8;           // 0,1
+            const uint qhshift = ((iqs % 64) / 16) * 2; // 0,2,4,6
+            const uint is = 8 * n + qhshift + is_b;     // 0..15
+            const uint qsi = n * 64 + (iqs % 32) * 2;   // 0,2,4..126
+            const uint qhi = n * 32 + (iqs % 16) * 2;   // 0,2,4..62
+
+            const float dscale = float(data_a[ib].d) * float(data_a[ib].scales[is]);
+
+            buf_a[buf_idx    ] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi    ] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi    ] >> qhshift) & 3) << 4)) - 32));
+            buf_a[buf_idx + 1] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi + 1] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi + 1] >> qhshift) & 3) << 4)) - 32));"""
+
 mulmat_body2 = """
 mulmat_body2 = """
         }
         }
         [[unroll]] for (uint l = 0; l < BN; l += loadstride_b) {
         [[unroll]] for (uint l = 0; l < BN; l += loadstride_b) {
@@ -1611,8 +1744,9 @@ layout (push_constant) uniform parameter
     uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
     uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
     uint d_offset;
     uint d_offset;
     float param1; float param2;
     float param1; float param2;
-} p;
+} p;"""
 
 
+generic_unary_op_funcs = """
 layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
 layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
 
 
 layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
 layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
@@ -1636,14 +1770,17 @@ uint dst_idx(uint idx) {
     const uint i11 = (idx - i13_offset - i12_offset) / p.ne10;
     const uint i11 = (idx - i13_offset - i12_offset) / p.ne10;
     const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10;
     const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10;
     return i13*p.nb13 + i12*p.nb12 + i11*p.nb11 + i10*p.nb10;
     return i13*p.nb13 + i12*p.nb12 + i11*p.nb11 + i10*p.nb10;
-}
+}"""
 
 
+generic_unary_op_main = """
 void main() {
 void main() {
     if (gl_GlobalInvocationID.x >= p.ne) {
     if (gl_GlobalInvocationID.x >= p.ne) {
         return;
         return;
     }
     }
 """
 """
 
 
+generic_unary_op_combined = f"{generic_unary_op_head}\n{generic_unary_op_funcs}\n{generic_unary_op_main}"
+
 generic_binary_op_head = """#version 450
 generic_binary_op_head = """#version 450
 
 
 #extension GL_EXT_shader_16bit_storage : require
 #extension GL_EXT_shader_16bit_storage : require
@@ -1655,13 +1792,14 @@ layout (push_constant) uniform parameter
     uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
     uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
     uint ne20; uint ne21; uint ne22; uint ne23; uint nb20; uint nb21; uint nb22; uint nb23;
     uint ne20; uint ne21; uint ne22; uint ne23; uint nb20; uint nb21; uint nb22; uint nb23;
     uint d_offset;
     uint d_offset;
-    uint param1; uint param2;
-} p;
+    float param1; float param2;
+} p;"""
 
 
+generic_binary_op_funcs = """
 layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
 layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
 
 
 layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
 layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
-layout (binding = 1) readonly buffer B {A_TYPE data_b[];};
+layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
 layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
 layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
 
 
 uint src0_idx(uint idx) {
 uint src0_idx(uint idx) {
@@ -1693,14 +1831,17 @@ uint dst_idx(uint idx) {
     const uint i21 = (idx - i23_offset - i22_offset) / p.ne20;
     const uint i21 = (idx - i23_offset - i22_offset) / p.ne20;
     const uint i20 = idx - i23_offset - i22_offset - i21*p.ne20;
     const uint i20 = idx - i23_offset - i22_offset - i21*p.ne20;
     return i23*p.nb23 + i22*p.nb22 + i21*p.nb21 + i20*p.nb20;
     return i23*p.nb23 + i22*p.nb22 + i21*p.nb21 + i20*p.nb20;
-}
+}"""
 
 
+generic_binary_op_main = """
 void main() {
 void main() {
     if (gl_GlobalInvocationID.x >= p.ne) {
     if (gl_GlobalInvocationID.x >= p.ne) {
         return;
         return;
     }
     }
 """
 """
 
 
+generic_binary_op_combined = f"{generic_binary_op_head}\n{generic_binary_op_funcs}\n{generic_binary_op_main}"
+
 # MUL F32
 # MUL F32
 mul_body = """
 mul_body = """
     data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]) * FLOAT_TYPE(data_b[src1_idx(gl_GlobalInvocationID.x)]));
     data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]) * FLOAT_TYPE(data_b[src1_idx(gl_GlobalInvocationID.x)]));
@@ -1745,39 +1886,55 @@ cpy_f16_f16_end = """
 """
 """
 
 
 # GET_ROWS
 # GET_ROWS
-get_rows_body = """
-#extension GL_EXT_control_flow_attributes : enable
-#extension GL_EXT_shader_8bit_storage : require
+get_rows_float_body = """
+void main() {
+    const uint i00 = gl_GlobalInvocationID.x;
+    const uint i10 = gl_GlobalInvocationID.y;
+    const uint i11 = (gl_GlobalInvocationID.z)/p.ne12;
+    const uint i12 = (gl_GlobalInvocationID.z)%p.ne12;
 
 
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+    if (i00 >= p.ne00) {
+        return;
+    }
 
 
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) readonly buffer Y {int data_b[];};
-layout (binding = 2) writeonly buffer D {D_TYPE dst[];};
+    const uint i01 = data_b[i10*p.nb10 + i11*p.nb11 + i12*p.nb12];
+
+    const uint a_offset = i01*p.nb01 + i11*p.nb02 + i12*p.nb03;
+    const uint d_offset = i10*p.nb21 + i11*p.nb22 + i12*p.nb23;
 
 
+#ifndef OPTIMIZATION_ERROR_WORKAROUND
+    data_d[d_offset + i00] = D_TYPE(data_a[a_offset + i00]);
+#else
+    data_d[d_offset + i00] = data_a[a_offset + i00];
+#endif
+}
+"""
+
+get_rows_body = """
 void main() {
 void main() {
-    const uint col = int(gl_GlobalInvocationID.x) * 2;
-    const uint row = int(gl_GlobalInvocationID.y);
+    const uint i00 = (gl_GlobalInvocationID.x)*2;
+    const uint i10 = gl_GlobalInvocationID.y;
+    const uint i11 = (gl_GlobalInvocationID.z)/p.ne12;
+    const uint i12 = (gl_GlobalInvocationID.z)%p.ne12;
 
 
-    if (col >= p.KY) {
+    if (i00 >= p.ne00) {
         return;
         return;
     }
     }
 
 
-    const uint r = uint(data_b[row]);
+    const uint i01 = data_b[i10*p.nb10 + i11*p.nb11 + i12*p.nb12];
 
 
-    // copy data_a[r*p.KY + col] to dst[row*p.KX + col]
-    const uint xi = r*p.KY + col;
-    const uint di = row*p.KY + col;
+    const uint a_offset = i01*p.nb01 + i11*p.nb02 + i12*p.nb03;
+    const uint d_offset = i10*p.nb21 + i11*p.nb22 + i12*p.nb23;
 
 
-    const uint ib = xi/QUANT_K; // block index
-    const uint iqs = (xi%QUANT_K)/QUANT_R; // quant index
-    const uint iybs = di - di%QUANT_K; // y block start index
+    const uint ib = a_offset + i00/QUANT_K; // block index
+    const uint iqs = (i00%QUANT_K)/QUANT_R; // quant index
+    const uint iybs = i00 - i00%QUANT_K; // dst block start index
     const uint y_offset = QUANT_R == 1 ? 1 : QUANT_K/2;
     const uint y_offset = QUANT_R == 1 ? 1 : QUANT_K/2;
 
 
     DEQUANT_FUNC
     DEQUANT_FUNC
 
 
-    dst[iybs + iqs + 0]        = D_TYPE(v.x);
-    dst[iybs + iqs + y_offset] = D_TYPE(v.y);
+    data_d[d_offset + iybs + iqs           ] = D_TYPE(v.x);
+    data_d[d_offset + iybs + iqs + y_offset] = D_TYPE(v.y);
 }
 }
 """
 """
 
 
@@ -2418,6 +2575,31 @@ async def main():
         tasks.append(string_to_spv("matmul_q8_0_f32", "".join(stream), {"LOAD_VEC_A": 2, "A_TYPE": "block_q8_0", "B_TYPE": "float", "D_TYPE": "float"}, fp16))
         tasks.append(string_to_spv("matmul_q8_0_f32", "".join(stream), {"LOAD_VEC_A": 2, "A_TYPE": "block_q8_0", "B_TYPE": "float", "D_TYPE": "float"}, fp16))
         tasks.append(string_to_spv("matmul_q8_0_f32_aligned", "".join(stream), {"LOAD_VEC_A": 2, "LOAD_VEC_B": load_vec, "A_TYPE": "block_q8_0", "B_TYPE": vec_type, "D_TYPE": "float"}, fp16))
         tasks.append(string_to_spv("matmul_q8_0_f32_aligned", "".join(stream), {"LOAD_VEC_A": 2, "LOAD_VEC_B": load_vec, "A_TYPE": "block_q8_0", "B_TYPE": vec_type, "D_TYPE": "float"}, fp16))
 
 
+        stream.clear()
+        stream.extend((mulmat_head, shader_int8_ext, shader_float_type, shader_q2_K_defines, mulmat_body1, mulmat_load_q2_K, mulmat_body2))
+        tasks.append(string_to_spv("matmul_q2_k_f32", "".join(stream), {"LOAD_VEC_A": 2, "A_TYPE": "block_q2_K", "B_TYPE": "float", "D_TYPE": "float"}, fp16))
+        tasks.append(string_to_spv("matmul_q2_k_f32_aligned", "".join(stream), {"LOAD_VEC_A": 2, "LOAD_VEC_B": load_vec, "A_TYPE": "block_q2_K", "B_TYPE": vec_type, "D_TYPE": "float"}, fp16))
+
+        stream.clear()
+        stream.extend((mulmat_head, shader_int8_ext, shader_float_type, shader_q3_K_defines, mulmat_body1, mulmat_load_q3_K, mulmat_body2))
+        tasks.append(string_to_spv("matmul_q3_k_f32", "".join(stream), {"LOAD_VEC_A": 2, "A_TYPE": "block_q3_K", "B_TYPE": "float", "D_TYPE": "float"}, fp16))
+        tasks.append(string_to_spv("matmul_q3_k_f32_aligned", "".join(stream), {"LOAD_VEC_A": 2, "LOAD_VEC_B": load_vec, "A_TYPE": "block_q3_K", "B_TYPE": vec_type, "D_TYPE": "float"}, fp16))
+
+        stream.clear()
+        stream.extend((mulmat_head, shader_int8_ext, shader_float_type, shader_q4_K_defines, mulmat_body1, mulmat_load_q4_K, mulmat_body2))
+        tasks.append(string_to_spv("matmul_q4_k_f32", "".join(stream), {"LOAD_VEC_A": 2, "A_TYPE": "block_q4_K", "B_TYPE": "float", "D_TYPE": "float"}, fp16))
+        tasks.append(string_to_spv("matmul_q4_k_f32_aligned", "".join(stream), {"LOAD_VEC_A": 2, "LOAD_VEC_B": load_vec, "A_TYPE": "block_q4_K", "B_TYPE": vec_type, "D_TYPE": "float"}, fp16))
+
+        stream.clear()
+        stream.extend((mulmat_head, shader_int8_ext, shader_float_type, shader_q5_K_defines, mulmat_body1, mulmat_load_q5_K, mulmat_body2))
+        tasks.append(string_to_spv("matmul_q5_k_f32", "".join(stream), {"LOAD_VEC_A": 2, "A_TYPE": "block_q5_K", "B_TYPE": "float", "D_TYPE": "float"}, fp16))
+        tasks.append(string_to_spv("matmul_q5_k_f32_aligned", "".join(stream), {"LOAD_VEC_A": 2, "LOAD_VEC_B": load_vec, "A_TYPE": "block_q5_K", "B_TYPE": vec_type, "D_TYPE": "float"}, fp16))
+
+        stream.clear()
+        stream.extend((mulmat_head, shader_int8_ext, shader_float_type, shader_q6_K_defines, mulmat_body1, mulmat_load_q6_K, mulmat_body2))
+        tasks.append(string_to_spv("matmul_q6_k_f32", "".join(stream), {"LOAD_VEC_A": 2, "A_TYPE": "block_q6_K", "B_TYPE": "float", "D_TYPE": "float"}, fp16))
+        tasks.append(string_to_spv("matmul_q6_k_f32_aligned", "".join(stream), {"LOAD_VEC_A": 2, "LOAD_VEC_B": load_vec, "A_TYPE": "block_q6_K", "B_TYPE": vec_type, "D_TYPE": "float"}, fp16))
+
     # Shaders where precision is needed, so no fp16 version
     # Shaders where precision is needed, so no fp16 version
 
 
     # mul mat vec
     # mul mat vec
@@ -2426,7 +2608,7 @@ async def main():
         stream.extend((mul_mat_vec_head, shader_int8_ext, shader_f32))
         stream.extend((mul_mat_vec_head, shader_int8_ext, shader_f32))
 
 
         if i == GGML_TYPE_F16:
         if i == GGML_TYPE_F16:
-            stream.extend((shader_f16_defines, shader_f16_dequant_func, mul_mat_vec_body))
+            stream.extend((shader_f16_defines, shader_float_dequant_func, mul_mat_vec_body))
         elif i == GGML_TYPE_Q4_0:
         elif i == GGML_TYPE_Q4_0:
             stream.extend((shader_q4_0_defines, shader_q4_0_dequant_func, mul_mat_vec_body))
             stream.extend((shader_q4_0_defines, shader_q4_0_dequant_func, mul_mat_vec_body))
         elif i == GGML_TYPE_Q4_1:
         elif i == GGML_TYPE_Q4_1:
@@ -2488,25 +2670,32 @@ async def main():
     # get_rows
     # get_rows
     for i in range(0, VK_NUM_TYPES):
     for i in range(0, VK_NUM_TYPES):
         stream.clear()
         stream.clear()
-        stream.extend((generic_head, shader_int8_ext, shader_f32))
+        stream.extend((generic_binary_op_head, shader_int8_ext, shader_f32))
+        optimization_workaround = False
 
 
-        if i == GGML_TYPE_F16:
-            stream.extend((shader_f16_defines,  shader_f16_dequant_func,  get_rows_body))
+        if i == GGML_TYPE_F32:
+            stream.extend((shader_f32_defines, generic_binary_op_funcs, get_rows_float_body))
+        elif i == GGML_TYPE_F16:
+            stream.extend((shader_f16_defines, generic_binary_op_funcs, get_rows_float_body))
+            optimization_workaround = True
         elif i == GGML_TYPE_Q4_0:
         elif i == GGML_TYPE_Q4_0:
-            stream.extend((shader_q4_0_defines, shader_q4_0_dequant_func, get_rows_body))
+            stream.extend((shader_q4_0_defines, shader_q4_0_dequant_func, generic_binary_op_funcs, get_rows_body))
         elif i == GGML_TYPE_Q4_1:
         elif i == GGML_TYPE_Q4_1:
-            stream.extend((shader_q4_1_defines, shader_q4_1_dequant_func, get_rows_body))
+            stream.extend((shader_q4_1_defines, shader_q4_1_dequant_func, generic_binary_op_funcs, get_rows_body))
         elif i == GGML_TYPE_Q5_0:
         elif i == GGML_TYPE_Q5_0:
-            stream.extend((shader_q5_0_defines, shader_q5_0_dequant_func, get_rows_body))
+            stream.extend((shader_q5_0_defines, shader_q5_0_dequant_func, generic_binary_op_funcs, get_rows_body))
         elif i == GGML_TYPE_Q5_1:
         elif i == GGML_TYPE_Q5_1:
-            stream.extend((shader_q5_1_defines, shader_q5_1_dequant_func, get_rows_body))
+            stream.extend((shader_q5_1_defines, shader_q5_1_dequant_func, generic_binary_op_funcs, get_rows_body))
         elif i == GGML_TYPE_Q8_0:
         elif i == GGML_TYPE_Q8_0:
-            stream.extend((shader_q8_0_defines, shader_q8_0_dequant_func, get_rows_body))
+            stream.extend((shader_q8_0_defines, shader_q8_0_dequant_func, generic_binary_op_funcs, get_rows_body))
         else:
         else:
             continue
             continue
 
 
-        tasks.append(string_to_spv(f"get_rows_{type_names[i]}", "".join(stream), {"B_TYPE": "float", "D_TYPE": "float16_t"}))
-        tasks.append(string_to_spv(f"get_rows_{type_names[i]}_f32", "".join(stream), {"B_TYPE": "float", "D_TYPE": "float"}))
+        if optimization_workaround:
+            tasks.append(string_to_spv(f"get_rows_{type_names[i]}", "".join(stream), {"B_TYPE": "int", "D_TYPE": "float16_t", "OPTIMIZATION_ERROR_WORKAROUND": "1"}))
+        else:
+            tasks.append(string_to_spv(f"get_rows_{type_names[i]}", "".join(stream), {"B_TYPE": "int", "D_TYPE": "float16_t"}))
+        tasks.append(string_to_spv(f"get_rows_{type_names[i]}_f32", "".join(stream), {"B_TYPE": "int", "D_TYPE": "float"}))
 
 
     tasks.append(string_to_spv("mul_mat_vec_p021_f16_f32", mul_mat_p021_src, {"A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"}))
     tasks.append(string_to_spv("mul_mat_vec_p021_f16_f32", mul_mat_p021_src, {"A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"}))
     tasks.append(string_to_spv("mul_mat_vec_nc_f16_f32", mul_mat_nc_src, {"A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"}))
     tasks.append(string_to_spv("mul_mat_vec_nc_f16_f32", mul_mat_nc_src, {"A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"}))
@@ -2515,20 +2704,20 @@ async def main():
     tasks.append(string_to_spv("norm_f32", f"{generic_head}\n{shader_f32}\n{norm_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
     tasks.append(string_to_spv("norm_f32", f"{generic_head}\n{shader_f32}\n{norm_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
     tasks.append(string_to_spv("rms_norm_f32", f"{generic_head}\n{shader_f32}\n{rms_norm_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
     tasks.append(string_to_spv("rms_norm_f32", f"{generic_head}\n{shader_f32}\n{rms_norm_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
 
 
-    tasks.append(string_to_spv("cpy_f32_f32", f"{generic_unary_op_head}\n{cpy_end}", {"A_TYPE": "float", "D_TYPE": "float"}))
-    tasks.append(string_to_spv("cpy_f32_f16", f"{generic_unary_op_head}\n{cpy_end}", {"A_TYPE": "float", "D_TYPE": "float16_t"}))
-    tasks.append(string_to_spv("cpy_f16_f16", f"{generic_unary_op_head}\n{cpy_f16_f16_end}", {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}))
+    tasks.append(string_to_spv("cpy_f32_f32", f"{generic_unary_op_combined}\n{cpy_end}", {"A_TYPE": "float", "D_TYPE": "float"}))
+    tasks.append(string_to_spv("cpy_f32_f16", f"{generic_unary_op_combined}\n{cpy_end}", {"A_TYPE": "float", "D_TYPE": "float16_t"}))
+    tasks.append(string_to_spv("cpy_f16_f16", f"{generic_unary_op_combined}\n{cpy_f16_f16_end}", {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}))
 
 
-    tasks.append(string_to_spv("add_f32", f"{generic_binary_op_head}\n{add_body}", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
+    tasks.append(string_to_spv("add_f32", f"{generic_binary_op_combined}\n{add_body}", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
 
 
     tasks.append(string_to_spv("split_k_reduce", mulmat_split_k_reduce_src, {}))
     tasks.append(string_to_spv("split_k_reduce", mulmat_split_k_reduce_src, {}))
-    tasks.append(string_to_spv("mul_f32", f"{generic_binary_op_head}\n{mul_body}", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
+    tasks.append(string_to_spv("mul_f32", f"{generic_binary_op_combined}\n{mul_body}", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
 
 
-    tasks.append(string_to_spv("scale_f32", f"{generic_unary_op_head}\n{scale_body}", {"A_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
+    tasks.append(string_to_spv("scale_f32", f"{generic_unary_op_combined}\n{scale_body}", {"A_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
 
 
-    tasks.append(string_to_spv("sqr_f32", f"{generic_unary_op_head}\n{sqr_body}", {"A_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
+    tasks.append(string_to_spv("sqr_f32", f"{generic_unary_op_combined}\n{sqr_body}", {"A_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
 
 
-    tasks.append(string_to_spv("clamp_f32", f"{generic_unary_op_head}\n{clamp_body}", {"A_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
+    tasks.append(string_to_spv("clamp_f32", f"{generic_unary_op_combined}\n{clamp_body}", {"A_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
 
 
     tasks.append(string_to_spv("gelu_f32", f"{generic_head}\n{shader_f32}\n{gelu_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
     tasks.append(string_to_spv("gelu_f32", f"{generic_head}\n{shader_f32}\n{gelu_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
     tasks.append(string_to_spv("silu_f32", f"{generic_head}\n{shader_f32}\n{silu_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
     tasks.append(string_to_spv("silu_f32", f"{generic_head}\n{shader_f32}\n{silu_body}", {"A_TYPE": "float", "D_TYPE": "float"}))

+ 14 - 5
llama.cpp

@@ -2121,10 +2121,6 @@ struct llama_context {
             ggml_backend_free(backend);
             ggml_backend_free(backend);
         }
         }
 
 
-#ifdef GGML_USE_VULKAN
-        ggml_vk_free_cpu_assist();
-#endif
-
         ggml_backend_buffer_free(buf_output);
         ggml_backend_buffer_free(buf_output);
     }
     }
 
 
@@ -14131,7 +14127,20 @@ struct llama_context * llama_new_context_with_model(
             }
             }
         }
         }
 #elif defined(GGML_USE_VULKAN)
 #elif defined(GGML_USE_VULKAN)
-        if (model->n_gpu_layers > 0) {
+        if (model->split_mode == LLAMA_SPLIT_MODE_ROW) {
+            LLAMA_LOG_ERROR("%s: Row split not supported. Failed to initialize Vulkan backend\n", __func__);
+            llama_free(ctx);
+            return nullptr;
+        }
+        if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
+            ggml_backend_t backend = ggml_backend_vk_init(0);
+            if (backend == nullptr) {
+                LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
+                llama_free(ctx);
+                return nullptr;
+            }
+            ctx->backends.push_back(backend);
+        } else {
             for (int device = 0; device < ggml_backend_vk_get_device_count(); ++device) {
             for (int device = 0; device < ggml_backend_vk_get_device_count(); ++device) {
                 ggml_backend_t backend = ggml_backend_vk_init(device);
                 ggml_backend_t backend = ggml_backend_vk_init(device);
                 if (backend == nullptr) {
                 if (backend == nullptr) {

Некоторые файлы не были показаны из-за большого количества измененных файлов