
Vulkan k-quant mmq and ggml-backend offload functionality (#6155)

* Fix Vulkan no kv offload incoherence

* Add k-quant mul mat mat shaders

* Rework working buffer allocation, reduces vram use noticeably

Clean up cpu assist code, replaced with ggml-backend offload function

* Default to all dedicated GPUs

* Add fallback for integrated GPUs if no dedicated GPUs are found

* Add debug info about which device is allocating memory

* Fix Intel dequant issue

Fix validation issue

* Fix Vulkan GGML_OP_GET_ROWS implementation

* Clean up merge artifacts

* Remove Vulkan warning
0cc4m, 1 year ago
Parent
Commit
ba0c7c70ab
7 changed files with 1231 additions and 704 deletions
  1. README.md                    (+0 −9)
  2. ggml-vulkan-shaders.hpp      (+696 −312)
  3. ggml-vulkan.cpp              (+286 −286)
  4. ggml-vulkan.h                (+0 −11)
  5. ggml.c                       (+0 −35)
  6. ggml_vk_generate_shaders.py  (+235 −46)
  7. llama.cpp                    (+14 −5)

+ 0 - 9
README.md

@@ -636,15 +636,6 @@ Building the program with BLAS support may lead to some performance improvements
 
 - #### Vulkan
 
-> [!WARNING]
->
-> Vulkan support has been broken in https://github.com/ggerganov/llama.cpp/pull/6122
-> due to relying on `GGML_OP_GET_ROWS` which is not yet properly supported by the Vulkan backend,
-> but should be fixed relatively soon (possibly in https://github.com/ggerganov/llama.cpp/pull/6155
-> (ref: https://github.com/ggerganov/llama.cpp/pull/6122#issuecomment-2015327635)).
->
-> Meanwhile, if you want to use the Vulkan backend, you should use the commit right before the breaking change, https://github.com/ggerganov/llama.cpp/commit/55c1b2a3bbd470e9e2a3a0618b92cf64a885f806
-
   **With docker**:
 
   You don't need to install Vulkan SDK. It will be installed inside the container.

File diff suppressed because it is too large
+ 696 - 312
ggml-vulkan-shaders.hpp


File diff suppressed because it is too large
+ 286 - 286
ggml-vulkan.cpp
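
The ggml-vulkan.cpp diff is suppressed above, but per the commit message it implements the new device-selection policy: default to all dedicated GPUs, falling back to integrated GPUs only when no dedicated GPU is found. A minimal sketch of that policy in the plain Vulkan C API (this illustrates the stated behavior only; it is not the actual ggml-vulkan.cpp code):

```cpp
#include <vulkan/vulkan.h>
#include <vector>

// Sketch: pick all discrete GPUs; if none exist, fall back to integrated ones.
std::vector<VkPhysicalDevice> pick_devices(VkInstance instance) {
    uint32_t count = 0;
    vkEnumeratePhysicalDevices(instance, &count, nullptr);
    std::vector<VkPhysicalDevice> all(count);
    vkEnumeratePhysicalDevices(instance, &count, all.data());

    std::vector<VkPhysicalDevice> discrete, integrated;
    for (VkPhysicalDevice dev : all) {
        VkPhysicalDeviceProperties props;
        vkGetPhysicalDeviceProperties(dev, &props);
        if (props.deviceType == VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU) {
            discrete.push_back(dev);
        } else if (props.deviceType == VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU) {
            integrated.push_back(dev);
        }
    }
    // all dedicated GPUs by default; integrated GPUs only as a fallback
    return discrete.empty() ? integrated : discrete;
}
```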


+ 0 - 11
ggml-vulkan.h

@@ -11,17 +11,6 @@ extern "C" {
 #define GGML_VK_MAX_DEVICES 16
 
 GGML_API void ggml_vk_instance_init(void);
-GGML_API void ggml_vk_init_cpu_assist(void);
-
-GGML_API void ggml_vk_preallocate_buffers_graph_cpu_assist(struct ggml_tensor * node);
-GGML_API void ggml_vk_preallocate_buffers_cpu_assist(void);
-GGML_API void ggml_vk_build_graph_cpu_assist(struct ggml_tensor * node, bool last_node);
-GGML_API bool ggml_vk_compute_forward_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor);
-#ifdef GGML_VULKAN_CHECK_RESULTS
-void ggml_vk_check_results_1_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor);
-#endif
-GGML_API void ggml_vk_graph_cleanup_cpu_assist(void);
-GGML_API void ggml_vk_free_cpu_assist(void);
 
 // backend API
 GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num);
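
The removed `*_cpu_assist` entry points are superseded by the regular ggml-backend interface that remains in this header. A minimal sketch of driving the Vulkan backend through that interface, using only functions that appear elsewhere in this commit (`ggml_backend_vk_get_device_count`, `ggml_backend_vk_init`, `ggml_backend_free`); the error handling is illustrative:

```cpp
#include "ggml-backend.h"
#include "ggml-vulkan.h"
#include <cstdio>

int main() {
    // One backend object per Vulkan device, mirroring the loop llama.cpp
    // now uses instead of the cpu-assist hooks.
    for (int dev = 0; dev < ggml_backend_vk_get_device_count(); ++dev) {
        ggml_backend_t backend = ggml_backend_vk_init(dev);
        if (backend == nullptr) {
            fprintf(stderr, "failed to initialize Vulkan device %d\n", dev);
            continue;
        }
        // ... allocate tensors and compute graphs through this backend ...
        ggml_backend_free(backend);
    }
    return 0;
}
```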

+ 0 - 35
ggml.c

@@ -278,8 +278,6 @@ inline static void * ggml_calloc(size_t num, size_t size) {
 #include <Accelerate/Accelerate.h>
 #if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
 #include "ggml-opencl.h"
-#elif defined(GGML_USE_VULKAN)
-#include "ggml-vulkan.h"
 #endif
 #elif defined(GGML_USE_OPENBLAS)
 #if defined(GGML_BLAS_USE_MKL)
@@ -289,8 +287,6 @@ inline static void * ggml_calloc(size_t num, size_t size) {
 #endif
 #elif defined(GGML_USE_CLBLAST)
 #include "ggml-opencl.h"
-#elif defined(GGML_USE_VULKAN)
-#include "ggml-vulkan.h"
 #endif
 
 // floating point type used to accumulate sums
@@ -2717,8 +2713,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
 
 #if defined(GGML_USE_CLBLAST)
         ggml_cl_init();
-#elif defined(GGML_USE_VULKAN)
-        ggml_vk_init_cpu_assist();
 #endif
 
         ggml_setup_op_has_task_pass();
@@ -16128,20 +16122,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
         return;
     }
 
-#if defined(GGML_USE_VULKAN)
-    const bool skip_cpu = ggml_vk_compute_forward_cpu_assist(params, tensor);
-#ifdef GGML_VULKAN_CHECK_RESULTS
-    if (skip_cpu) {
-        ggml_vk_check_results_1_cpu_assist(params, tensor);
-    }
-#endif
-    if (skip_cpu) {
-        return;
-    }
-    GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU);
-    GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU);
-#endif // GGML_USE_VULKAN
-
     switch (tensor->op) {
         case GGML_OP_DUP:
             {
@@ -18617,17 +18597,6 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
         }
     }
 
-#ifdef GGML_USE_VULKAN
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_vk_preallocate_buffers_graph_cpu_assist(cgraph->nodes[i]);
-    }
-    ggml_vk_preallocate_buffers_cpu_assist();
-
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_vk_build_graph_cpu_assist(cgraph->nodes[i], i == cgraph->n_nodes - 1);
-    }
-#endif
-
     const int n_threads = cplan->n_threads;
 
     struct ggml_compute_state_shared state_shared = {
@@ -18684,10 +18653,6 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
         }
     }
 
-#ifdef GGML_USE_VULKAN
-    ggml_vk_graph_cleanup_cpu_assist();
-#endif
-
     // performance stats (graph)
     {
         int64_t perf_cycles_cur  = ggml_perf_cycles()  - perf_start_cycles;
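
With the cpu-assist blocks removed, `ggml_graph_compute()` no longer intercepts ops for Vulkan; GPU work is dispatched explicitly through a backend object instead. A minimal end-to-end sketch, assuming the ggml-backend API as of this era (allocation sizes and error handling are illustrative):

```cpp
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "ggml-vulkan.h"
#include <cstdio>
#include <vector>

int main() {
    ggml_backend_t backend = ggml_backend_vk_init(0); // Vulkan device 0
    if (!backend) { fprintf(stderr, "no Vulkan backend\n"); return 1; }

    // no_alloc: tensor metadata lives in the context, data lives on the device
    struct ggml_init_params ip = { /*mem_size*/ 1024*1024, /*mem_buffer*/ nullptr, /*no_alloc*/ true };
    struct ggml_context * ctx = ggml_init(ip);

    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    struct ggml_tensor * c = ggml_add(ctx, a, b);

    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);

    std::vector<float> x(8, 1.0f), y(8, 2.0f), out(8);
    ggml_backend_tensor_set(a, x.data(), 0, ggml_nbytes(a));
    ggml_backend_tensor_set(b, y.data(), 0, ggml_nbytes(b));

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, c);
    ggml_backend_graph_compute(backend, gf); // runs on the GPU, no cpu-assist

    ggml_backend_tensor_get(c, out.data(), 0, ggml_nbytes(c));

    ggml_backend_buffer_free(buf);
    ggml_free(ctx);
    ggml_backend_free(backend);
    return 0;
}
```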

+ 235 - 46
ggml_vk_generate_shaders.py

@@ -18,6 +18,12 @@ shader_int8_ext = """
 """
 
 # Type-specific defines
+shader_f32_defines = """
+#define QUANT_K 1
+#define QUANT_R 1
+
+#define A_TYPE float
+"""
 shader_f16_defines = """
 #define QUANT_K 1
 #define QUANT_R 1
@@ -157,8 +163,8 @@ struct block_q6_K
 """
 
 # Dequant functions
-shader_f16_dequant_func = """
-#define DEQUANT_FUNC vec2 v = vec2(data_a[ib + 0], data_a[ib + 1]);
+shader_float_dequant_func = """
+#define DEQUANT_FUNC vec2 v = vec2(ib, ib);  // data_a[ib], data_a[ib + 1]);
 """
 
 shader_q4_0_dequant_func = """
@@ -410,6 +416,133 @@ mulmat_load_q8_0 = """
             buf_a[buf_idx    ] = FLOAT_TYPE(v.x);
             buf_a[buf_idx + 1] = FLOAT_TYPE(v.y);"""
 
+
+mulmat_load_q2_K = """
+            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
+            const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a * LOAD_VEC_A;
+
+            const uint ib = idx / 128;                         // 2 values per idx
+            const uint iqs = idx % 128;                        // 0..127
+
+            const uint qsi = (iqs / 64) * 32 + (iqs % 16) * 2; // 0,2,4..30
+            const uint scalesi = iqs / 8;                      // 0..15
+            const uint qsshift = ((iqs % 64) / 16) * 2;        // 0,2,4,6
+
+            const uvec2 qs = uvec2(data_a[ib].qs[qsi], data_a[ib].qs[qsi + 1]);
+            const uint scales = data_a[ib].scales[scalesi];
+            const vec2 d = vec2(data_a[ib].d);
+
+            const vec2 v = d.x * float(scales & 0xF) * vec2((qs >> qsshift) & 3) - d.y * float(scales >> 4);
+
+            buf_a[buf_idx    ] = FLOAT_TYPE(v.x);
+            buf_a[buf_idx + 1] = FLOAT_TYPE(v.y);"""
+
+mulmat_load_q3_K = """
+            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
+            const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a * LOAD_VEC_A;
+
+            const uint ib = idx / 128;                   // 2 values per idx
+            const uint iqs = idx % 128;                  // 0..127
+
+            const uint n = iqs / 64;                     // 0,1
+            const uint qsi = n * 32 + (iqs % 16) * 2;    // 0,2,4..62
+            const uint hmi =          (iqs % 16) * 2;    // 0,2,4..30
+            const uint j = (iqs % 64) / 4;               // 0..3
+            const uint is = iqs / 8;                     // 0..15
+            const uint halfsplit = ((iqs % 64) / 16);    // 0,1,2,3
+            const uint qsshift = halfsplit * 2;          // 0,2,4,6
+            const uint m = 1 << (4 * n + halfsplit);     // 1,2,4,8,16,32,64,128
+
+            const int8_t us = int8_t(is <  4 ? (data_a[ib].scales[is-0] & 0xF) | (((data_a[ib].scales[is+8] >> 0) & 3) << 4) :
+                                    is <  8 ? (data_a[ib].scales[is-0] & 0xF) | (((data_a[ib].scales[is+4] >> 2) & 3) << 4) :
+                                    is < 12 ? (data_a[ib].scales[is-8] >>  4) | (((data_a[ib].scales[is+0] >> 4) & 3) << 4) :
+                                            (data_a[ib].scales[is-8] >>  4) | (((data_a[ib].scales[is-4] >> 6) & 3) << 4));
+            const float dl = float(data_a[ib].d) * float(us - 32);
+
+            buf_a[buf_idx    ] = FLOAT_TYPE(dl * float(int8_t((data_a[ib].qs[qsi    ] >> qsshift) & 3) - (((data_a[ib].hmask[hmi    ] & m) != 0) ? 0 : 4)));
+            buf_a[buf_idx + 1] = FLOAT_TYPE(dl * float(int8_t((data_a[ib].qs[qsi + 1] >> qsshift) & 3) - (((data_a[ib].hmask[hmi + 1] & m) != 0) ? 0 : 4)));"""
+
+mulmat_load_q4_K = """
+            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
+            const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a * LOAD_VEC_A;
+
+            const uint ib = idx / 128;                 // 2 values per idx
+            const uint iqs = idx % 128;                // 0..127
+
+            const uint n = iqs / 32;                   // 0,1,2,3
+            const uint b = (iqs % 32) / 16;            // 0,1
+            const uint is = 2 * n + b;                 // 0..7
+            const uint qsi = n * 32 + (iqs % 16) * 2;  // 0,2,4..126
+
+            const vec2 loadd = vec2(data_a[ib].d);
+
+            uint8_t sc;
+            uint8_t mbyte;
+            if (is < 4) {
+                sc    = uint8_t(data_a[ib].scales[is    ] & 63);
+                mbyte = uint8_t(data_a[ib].scales[is + 4] & 63);
+            } else {
+                sc    = uint8_t((data_a[ib].scales[is + 4] & 0xF) | ((data_a[ib].scales[is - 4] >> 6) << 4));
+                mbyte = uint8_t((data_a[ib].scales[is + 4] >>  4) | ((data_a[ib].scales[is    ] >> 6) << 4));
+            }
+            const float d = loadd.x * sc;
+            const float m = loadd.y * mbyte;
+
+            buf_a[buf_idx    ] = FLOAT_TYPE(d * float((data_a[ib].qs[qsi    ] >> (b * 4)) & 0xF) - m);
+            buf_a[buf_idx + 1] = FLOAT_TYPE(d * float((data_a[ib].qs[qsi + 1] >> (b * 4)) & 0xF) - m);"""
+
+mulmat_load_q5_K = """
+            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
+            const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a * LOAD_VEC_A;
+
+            const uint ib = idx / 128;                 // 2 values per idx
+            const uint iqs = idx % 128;                // 0..127
+
+            const uint n = iqs / 32;                   // 0,1,2,3
+            const uint b = (iqs % 32) / 16;            // 0,1
+            const uint is = 2 * n + b;                 // 0..7
+            const uint qsi = n * 32 + (iqs % 16) * 2;  // 0,2,4..126
+            const uint qhi = (iqs % 16) * 2;           // 0,2,4..30
+
+            const uint8_t hm = uint8_t(1 << (iqs / 16));
+
+            const vec2 loadd = vec2(data_a[ib].d);
+
+            uint8_t sc;
+            uint8_t mbyte;
+            if (is < 4) {
+                sc    = uint8_t(data_a[ib].scales[is    ] & 63);
+                mbyte = uint8_t(data_a[ib].scales[is + 4] & 63);
+            } else {
+                sc    = uint8_t((data_a[ib].scales[is + 4] & 0xF) | ((data_a[ib].scales[is - 4] >> 6) << 4));
+                mbyte = uint8_t((data_a[ib].scales[is + 4] >>  4) | ((data_a[ib].scales[is    ] >> 6) << 4));
+            }
+            const float d = loadd.x * sc;
+            const float m = loadd.y * mbyte;
+
+            buf_a[buf_idx    ] = FLOAT_TYPE(d * (float((data_a[ib].qs[qsi    ] >> (b * 4)) & 0xF) + float((data_a[ib].qh[qhi    ] & hm) != 0 ? 16 : 0)) - m);
+            buf_a[buf_idx + 1] = FLOAT_TYPE(d * (float((data_a[ib].qs[qsi + 1] >> (b * 4)) & 0xF) + float((data_a[ib].qh[qhi + 1] & hm) != 0 ? 16 : 0)) - m);"""
+
+mulmat_load_q6_K = """
+            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
+            const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a * LOAD_VEC_A;
+
+            const uint ib = idx / 128;                  // 2 values per idx
+            const uint iqs = idx % 128;                 // 0..127
+
+            const uint n = iqs / 64;                    // 0,1
+            const uint b = (iqs % 64) / 32;             // 0,1
+            const uint is_b = (iqs % 16) / 8;           // 0,1
+            const uint qhshift = ((iqs % 64) / 16) * 2; // 0,2,4,6
+            const uint is = 8 * n + qhshift + is_b;     // 0..15
+            const uint qsi = n * 64 + (iqs % 32) * 2;   // 0,2,4..126
+            const uint qhi = n * 32 + (iqs % 16) * 2;   // 0,2,4..62
+
+            const float dscale = float(data_a[ib].d) * float(data_a[ib].scales[is]);
+
+            buf_a[buf_idx    ] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi    ] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi    ] >> qhshift) & 3) << 4)) - 32));
+            buf_a[buf_idx + 1] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi + 1] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi + 1] >> qhshift) & 3) << 4)) - 32));"""
+
 mulmat_body2 = """
         }
         [[unroll]] for (uint l = 0; l < BN; l += loadstride_b) {
@@ -1611,8 +1744,9 @@ layout (push_constant) uniform parameter
     uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
     uint d_offset;
     float param1; float param2;
-} p;
+} p;"""
 
+generic_unary_op_funcs = """
 layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
 
 layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
@@ -1636,14 +1770,17 @@ uint dst_idx(uint idx) {
     const uint i11 = (idx - i13_offset - i12_offset) / p.ne10;
     const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10;
     return i13*p.nb13 + i12*p.nb12 + i11*p.nb11 + i10*p.nb10;
-}
+}"""
 
+generic_unary_op_main = """
 void main() {
     if (gl_GlobalInvocationID.x >= p.ne) {
         return;
     }
 """
 
+generic_unary_op_combined = f"{generic_unary_op_head}\n{generic_unary_op_funcs}\n{generic_unary_op_main}"
+
 generic_binary_op_head = """#version 450
 
 #extension GL_EXT_shader_16bit_storage : require
@@ -1655,13 +1792,14 @@ layout (push_constant) uniform parameter
     uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
     uint ne20; uint ne21; uint ne22; uint ne23; uint nb20; uint nb21; uint nb22; uint nb23;
     uint d_offset;
-    uint param1; uint param2;
-} p;
+    float param1; float param2;
+} p;"""
 
+generic_binary_op_funcs = """
 layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
 
 layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
-layout (binding = 1) readonly buffer B {A_TYPE data_b[];};
+layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
 layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
 
 uint src0_idx(uint idx) {
@@ -1693,14 +1831,17 @@ uint dst_idx(uint idx) {
     const uint i21 = (idx - i23_offset - i22_offset) / p.ne20;
     const uint i20 = idx - i23_offset - i22_offset - i21*p.ne20;
     return i23*p.nb23 + i22*p.nb22 + i21*p.nb21 + i20*p.nb20;
-}
+}"""
 
+generic_binary_op_main = """
 void main() {
     if (gl_GlobalInvocationID.x >= p.ne) {
         return;
     }
 """
 
+generic_binary_op_combined = f"{generic_binary_op_head}\n{generic_binary_op_funcs}\n{generic_binary_op_main}"
+
 # MUL F32
 mul_body = """
     data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]) * FLOAT_TYPE(data_b[src1_idx(gl_GlobalInvocationID.x)]));
@@ -1745,39 +1886,55 @@ cpy_f16_f16_end = """
 """
 
 # GET_ROWS
-get_rows_body = """
-#extension GL_EXT_control_flow_attributes : enable
-#extension GL_EXT_shader_8bit_storage : require
+get_rows_float_body = """
+void main() {
+    const uint i00 = gl_GlobalInvocationID.x;
+    const uint i10 = gl_GlobalInvocationID.y;
+    const uint i11 = (gl_GlobalInvocationID.z)/p.ne12;
+    const uint i12 = (gl_GlobalInvocationID.z)%p.ne12;
 
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+    if (i00 >= p.ne00) {
+        return;
+    }
 
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) readonly buffer Y {int data_b[];};
-layout (binding = 2) writeonly buffer D {D_TYPE dst[];};
+    const uint i01 = data_b[i10*p.nb10 + i11*p.nb11 + i12*p.nb12];
+
+    const uint a_offset = i01*p.nb01 + i11*p.nb02 + i12*p.nb03;
+    const uint d_offset = i10*p.nb21 + i11*p.nb22 + i12*p.nb23;
 
+#ifndef OPTIMIZATION_ERROR_WORKAROUND
+    data_d[d_offset + i00] = D_TYPE(data_a[a_offset + i00]);
+#else
+    data_d[d_offset + i00] = data_a[a_offset + i00];
+#endif
+}
+"""
+
+get_rows_body = """
 void main() {
-    const uint col = int(gl_GlobalInvocationID.x) * 2;
-    const uint row = int(gl_GlobalInvocationID.y);
+    const uint i00 = (gl_GlobalInvocationID.x)*2;
+    const uint i10 = gl_GlobalInvocationID.y;
+    const uint i11 = (gl_GlobalInvocationID.z)/p.ne12;
+    const uint i12 = (gl_GlobalInvocationID.z)%p.ne12;
 
-    if (col >= p.KY) {
+    if (i00 >= p.ne00) {
         return;
     }
 
-    const uint r = uint(data_b[row]);
+    const uint i01 = data_b[i10*p.nb10 + i11*p.nb11 + i12*p.nb12];
 
-    // copy data_a[r*p.KY + col] to dst[row*p.KX + col]
-    const uint xi = r*p.KY + col;
-    const uint di = row*p.KY + col;
+    const uint a_offset = i01*p.nb01 + i11*p.nb02 + i12*p.nb03;
+    const uint d_offset = i10*p.nb21 + i11*p.nb22 + i12*p.nb23;
 
-    const uint ib = xi/QUANT_K; // block index
-    const uint iqs = (xi%QUANT_K)/QUANT_R; // quant index
-    const uint iybs = di - di%QUANT_K; // y block start index
+    const uint ib = a_offset + i00/QUANT_K; // block index
+    const uint iqs = (i00%QUANT_K)/QUANT_R; // quant index
+    const uint iybs = i00 - i00%QUANT_K; // dst block start index
     const uint y_offset = QUANT_R == 1 ? 1 : QUANT_K/2;
 
     DEQUANT_FUNC
 
-    dst[iybs + iqs + 0]        = D_TYPE(v.x);
-    dst[iybs + iqs + y_offset] = D_TYPE(v.y);
+    data_d[d_offset + iybs + iqs           ] = D_TYPE(v.x);
+    data_d[d_offset + iybs + iqs + y_offset] = D_TYPE(v.y);
 }
 """
 
@@ -2418,6 +2575,31 @@ async def main():
         tasks.append(string_to_spv("matmul_q8_0_f32", "".join(stream), {"LOAD_VEC_A": 2, "A_TYPE": "block_q8_0", "B_TYPE": "float", "D_TYPE": "float"}, fp16))
         tasks.append(string_to_spv("matmul_q8_0_f32_aligned", "".join(stream), {"LOAD_VEC_A": 2, "LOAD_VEC_B": load_vec, "A_TYPE": "block_q8_0", "B_TYPE": vec_type, "D_TYPE": "float"}, fp16))
 
+        stream.clear()
+        stream.extend((mulmat_head, shader_int8_ext, shader_float_type, shader_q2_K_defines, mulmat_body1, mulmat_load_q2_K, mulmat_body2))
+        tasks.append(string_to_spv("matmul_q2_k_f32", "".join(stream), {"LOAD_VEC_A": 2, "A_TYPE": "block_q2_K", "B_TYPE": "float", "D_TYPE": "float"}, fp16))
+        tasks.append(string_to_spv("matmul_q2_k_f32_aligned", "".join(stream), {"LOAD_VEC_A": 2, "LOAD_VEC_B": load_vec, "A_TYPE": "block_q2_K", "B_TYPE": vec_type, "D_TYPE": "float"}, fp16))
+
+        stream.clear()
+        stream.extend((mulmat_head, shader_int8_ext, shader_float_type, shader_q3_K_defines, mulmat_body1, mulmat_load_q3_K, mulmat_body2))
+        tasks.append(string_to_spv("matmul_q3_k_f32", "".join(stream), {"LOAD_VEC_A": 2, "A_TYPE": "block_q3_K", "B_TYPE": "float", "D_TYPE": "float"}, fp16))
+        tasks.append(string_to_spv("matmul_q3_k_f32_aligned", "".join(stream), {"LOAD_VEC_A": 2, "LOAD_VEC_B": load_vec, "A_TYPE": "block_q3_K", "B_TYPE": vec_type, "D_TYPE": "float"}, fp16))
+
+        stream.clear()
+        stream.extend((mulmat_head, shader_int8_ext, shader_float_type, shader_q4_K_defines, mulmat_body1, mulmat_load_q4_K, mulmat_body2))
+        tasks.append(string_to_spv("matmul_q4_k_f32", "".join(stream), {"LOAD_VEC_A": 2, "A_TYPE": "block_q4_K", "B_TYPE": "float", "D_TYPE": "float"}, fp16))
+        tasks.append(string_to_spv("matmul_q4_k_f32_aligned", "".join(stream), {"LOAD_VEC_A": 2, "LOAD_VEC_B": load_vec, "A_TYPE": "block_q4_K", "B_TYPE": vec_type, "D_TYPE": "float"}, fp16))
+
+        stream.clear()
+        stream.extend((mulmat_head, shader_int8_ext, shader_float_type, shader_q5_K_defines, mulmat_body1, mulmat_load_q5_K, mulmat_body2))
+        tasks.append(string_to_spv("matmul_q5_k_f32", "".join(stream), {"LOAD_VEC_A": 2, "A_TYPE": "block_q5_K", "B_TYPE": "float", "D_TYPE": "float"}, fp16))
+        tasks.append(string_to_spv("matmul_q5_k_f32_aligned", "".join(stream), {"LOAD_VEC_A": 2, "LOAD_VEC_B": load_vec, "A_TYPE": "block_q5_K", "B_TYPE": vec_type, "D_TYPE": "float"}, fp16))
+
+        stream.clear()
+        stream.extend((mulmat_head, shader_int8_ext, shader_float_type, shader_q6_K_defines, mulmat_body1, mulmat_load_q6_K, mulmat_body2))
+        tasks.append(string_to_spv("matmul_q6_k_f32", "".join(stream), {"LOAD_VEC_A": 2, "A_TYPE": "block_q6_K", "B_TYPE": "float", "D_TYPE": "float"}, fp16))
+        tasks.append(string_to_spv("matmul_q6_k_f32_aligned", "".join(stream), {"LOAD_VEC_A": 2, "LOAD_VEC_B": load_vec, "A_TYPE": "block_q6_K", "B_TYPE": vec_type, "D_TYPE": "float"}, fp16))
+
     # Shaders where precision is needed, so no fp16 version
 
     # mul mat vec
@@ -2426,7 +2608,7 @@ async def main():
         stream.extend((mul_mat_vec_head, shader_int8_ext, shader_f32))
 
         if i == GGML_TYPE_F16:
-            stream.extend((shader_f16_defines, shader_f16_dequant_func, mul_mat_vec_body))
+            stream.extend((shader_f16_defines, shader_float_dequant_func, mul_mat_vec_body))
         elif i == GGML_TYPE_Q4_0:
             stream.extend((shader_q4_0_defines, shader_q4_0_dequant_func, mul_mat_vec_body))
         elif i == GGML_TYPE_Q4_1:
@@ -2488,25 +2670,32 @@ async def main():
     # get_rows
     for i in range(0, VK_NUM_TYPES):
         stream.clear()
-        stream.extend((generic_head, shader_int8_ext, shader_f32))
+        stream.extend((generic_binary_op_head, shader_int8_ext, shader_f32))
+        optimization_workaround = False
 
-        if i == GGML_TYPE_F16:
-            stream.extend((shader_f16_defines,  shader_f16_dequant_func,  get_rows_body))
+        if i == GGML_TYPE_F32:
+            stream.extend((shader_f32_defines, generic_binary_op_funcs, get_rows_float_body))
+        elif i == GGML_TYPE_F16:
+            stream.extend((shader_f16_defines, generic_binary_op_funcs, get_rows_float_body))
+            optimization_workaround = True
         elif i == GGML_TYPE_Q4_0:
-            stream.extend((shader_q4_0_defines, shader_q4_0_dequant_func, get_rows_body))
+            stream.extend((shader_q4_0_defines, shader_q4_0_dequant_func, generic_binary_op_funcs, get_rows_body))
         elif i == GGML_TYPE_Q4_1:
-            stream.extend((shader_q4_1_defines, shader_q4_1_dequant_func, get_rows_body))
+            stream.extend((shader_q4_1_defines, shader_q4_1_dequant_func, generic_binary_op_funcs, get_rows_body))
         elif i == GGML_TYPE_Q5_0:
-            stream.extend((shader_q5_0_defines, shader_q5_0_dequant_func, get_rows_body))
+            stream.extend((shader_q5_0_defines, shader_q5_0_dequant_func, generic_binary_op_funcs, get_rows_body))
         elif i == GGML_TYPE_Q5_1:
-            stream.extend((shader_q5_1_defines, shader_q5_1_dequant_func, get_rows_body))
+            stream.extend((shader_q5_1_defines, shader_q5_1_dequant_func, generic_binary_op_funcs, get_rows_body))
         elif i == GGML_TYPE_Q8_0:
-            stream.extend((shader_q8_0_defines, shader_q8_0_dequant_func, get_rows_body))
+            stream.extend((shader_q8_0_defines, shader_q8_0_dequant_func, generic_binary_op_funcs, get_rows_body))
         else:
             continue
 
-        tasks.append(string_to_spv(f"get_rows_{type_names[i]}", "".join(stream), {"B_TYPE": "float", "D_TYPE": "float16_t"}))
-        tasks.append(string_to_spv(f"get_rows_{type_names[i]}_f32", "".join(stream), {"B_TYPE": "float", "D_TYPE": "float"}))
+        if optimization_workaround:
+            tasks.append(string_to_spv(f"get_rows_{type_names[i]}", "".join(stream), {"B_TYPE": "int", "D_TYPE": "float16_t", "OPTIMIZATION_ERROR_WORKAROUND": "1"}))
+        else:
+            tasks.append(string_to_spv(f"get_rows_{type_names[i]}", "".join(stream), {"B_TYPE": "int", "D_TYPE": "float16_t"}))
+        tasks.append(string_to_spv(f"get_rows_{type_names[i]}_f32", "".join(stream), {"B_TYPE": "int", "D_TYPE": "float"}))
 
     tasks.append(string_to_spv("mul_mat_vec_p021_f16_f32", mul_mat_p021_src, {"A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"}))
     tasks.append(string_to_spv("mul_mat_vec_nc_f16_f32", mul_mat_nc_src, {"A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"}))
@@ -2515,20 +2704,20 @@ async def main():
     tasks.append(string_to_spv("norm_f32", f"{generic_head}\n{shader_f32}\n{norm_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
     tasks.append(string_to_spv("rms_norm_f32", f"{generic_head}\n{shader_f32}\n{rms_norm_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
 
-    tasks.append(string_to_spv("cpy_f32_f32", f"{generic_unary_op_head}\n{cpy_end}", {"A_TYPE": "float", "D_TYPE": "float"}))
-    tasks.append(string_to_spv("cpy_f32_f16", f"{generic_unary_op_head}\n{cpy_end}", {"A_TYPE": "float", "D_TYPE": "float16_t"}))
-    tasks.append(string_to_spv("cpy_f16_f16", f"{generic_unary_op_head}\n{cpy_f16_f16_end}", {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}))
+    tasks.append(string_to_spv("cpy_f32_f32", f"{generic_unary_op_combined}\n{cpy_end}", {"A_TYPE": "float", "D_TYPE": "float"}))
+    tasks.append(string_to_spv("cpy_f32_f16", f"{generic_unary_op_combined}\n{cpy_end}", {"A_TYPE": "float", "D_TYPE": "float16_t"}))
+    tasks.append(string_to_spv("cpy_f16_f16", f"{generic_unary_op_combined}\n{cpy_f16_f16_end}", {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}))
 
-    tasks.append(string_to_spv("add_f32", f"{generic_binary_op_head}\n{add_body}", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
+    tasks.append(string_to_spv("add_f32", f"{generic_binary_op_combined}\n{add_body}", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
 
     tasks.append(string_to_spv("split_k_reduce", mulmat_split_k_reduce_src, {}))
-    tasks.append(string_to_spv("mul_f32", f"{generic_binary_op_head}\n{mul_body}", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
+    tasks.append(string_to_spv("mul_f32", f"{generic_binary_op_combined}\n{mul_body}", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
 
-    tasks.append(string_to_spv("scale_f32", f"{generic_unary_op_head}\n{scale_body}", {"A_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
+    tasks.append(string_to_spv("scale_f32", f"{generic_unary_op_combined}\n{scale_body}", {"A_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
 
-    tasks.append(string_to_spv("sqr_f32", f"{generic_unary_op_head}\n{sqr_body}", {"A_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
+    tasks.append(string_to_spv("sqr_f32", f"{generic_unary_op_combined}\n{sqr_body}", {"A_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
 
-    tasks.append(string_to_spv("clamp_f32", f"{generic_unary_op_head}\n{clamp_body}", {"A_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
+    tasks.append(string_to_spv("clamp_f32", f"{generic_unary_op_combined}\n{clamp_body}", {"A_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
 
     tasks.append(string_to_spv("gelu_f32", f"{generic_head}\n{shader_f32}\n{gelu_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
     tasks.append(string_to_spv("silu_f32", f"{generic_head}\n{shader_f32}\n{silu_body}", {"A_TYPE": "float", "D_TYPE": "float"}))

+ 14 - 5
llama.cpp

@@ -2121,10 +2121,6 @@ struct llama_context {
             ggml_backend_free(backend);
         }
 
-#ifdef GGML_USE_VULKAN
-        ggml_vk_free_cpu_assist();
-#endif
-
         ggml_backend_buffer_free(buf_output);
     }
 
@@ -14131,7 +14127,20 @@ struct llama_context * llama_new_context_with_model(
             }
         }
 #elif defined(GGML_USE_VULKAN)
-        if (model->n_gpu_layers > 0) {
+        if (model->split_mode == LLAMA_SPLIT_MODE_ROW) {
+            LLAMA_LOG_ERROR("%s: Row split not supported. Failed to initialize Vulkan backend\n", __func__);
+            llama_free(ctx);
+            return nullptr;
+        }
+        if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
+            ggml_backend_t backend = ggml_backend_vk_init(0);
+            if (backend == nullptr) {
+                LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
+                llama_free(ctx);
+                return nullptr;
+            }
+            ctx->backends.push_back(backend);
+        } else {
             for (int device = 0; device < ggml_backend_vk_get_device_count(); ++device) {
                 ggml_backend_t backend = ggml_backend_vk_init(device);
                 if (backend == nullptr) {
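
A hedged usage sketch of the new split-mode handling (the model path and parameter values are placeholders): `LLAMA_SPLIT_MODE_ROW` now fails context creation with an error, `LLAMA_SPLIT_MODE_NONE` uses only Vulkan device 0, and the default layer split initializes one backend per device:

```cpp
#include "llama.h"

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 99;                    // offload all layers
    mparams.split_mode   = LLAMA_SPLIT_MODE_NONE; // single Vulkan device 0

    llama_model * model = llama_load_model_from_file("model.gguf", mparams);
    if (!model) return 1;

    llama_context_params cparams = llama_context_default_params();
    // With LLAMA_SPLIT_MODE_ROW this would log an error and return nullptr.
    llama_context * ctx = llama_new_context_with_model(model, cparams);
    if (!ctx) { llama_free_model(model); return 1; }

    // ... run inference ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```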

Some files were not shown because too many files changed in this diff