@@ -14104,20 +14104,11 @@ size_t comp_size;
size_t comp_nb[GGML_MAX_DIMS];
size_t check_counter = 0;
static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, int tensor_idx) {
- ggml_tensor * tensor = cgraph->nodes[tensor_idx];
+ ggml_tensor * tensor = cgraph->nodes[tensor_idx + ctx->num_additional_fused_ops];
if (tensor->op == GGML_OP_TRANSPOSE || tensor->op == GGML_OP_SET_ROWS) {
return;
}

- bool fused_rms_norm_mul = false;
- int rms_norm_idx = -1;
- if (ctx->num_additional_fused_ops == 1 &&
- tensor->op == GGML_OP_RMS_NORM &&
- cgraph->nodes[tensor_idx + 1]->op == GGML_OP_MUL) {
- fused_rms_norm_mul = true;
- tensor = cgraph->nodes[tensor_idx + 1];
- }
-
check_counter++;
if (!(vk_output_tensor > 0 && vk_output_tensor == check_counter) && check_counter <= vk_skip_checks) {
return;
@@ -14125,9 +14116,6 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *

VK_LOG_DEBUG("ggml_vk_check_results_0(" << tensor->name << ")");

- ggml_tensor * src0 = tensor->src[0];
- ggml_tensor * src1 = tensor->src[1];
-
struct ggml_init_params iparams = {
/*.mem_size =*/ 2ul*1024ul*1024ul*1024ul,
/*.mem_buffer =*/ NULL,
@@ -14137,328 +14125,339 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *
struct ggml_context * ggml_ctx = ggml_init(iparams);

std::array<struct ggml_tensor *, GGML_MAX_SRC> src_clone = {nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr};
- std::array<size_t, GGML_MAX_SRC> src_size = {};
- std::array<void *, GGML_MAX_SRC> src_buffer = {};
const char * srci_name[GGML_MAX_SRC] = {"src0", "src1", "src2", "src3", "src4", "src5", "src6", "src7", "src8", "src9"};

+ std::map<ggml_tensor *, ggml_tensor *> cloned_tensors;
+ std::vector<void *> cloned_mallocs;
+
struct ggml_tensor * tensor_clone = nullptr;

- for (int i = 0; i < GGML_MAX_SRC; i++) {
- ggml_tensor * srci = tensor->src[i];
- if (fused_rms_norm_mul) {
- rms_norm_idx = tensor->src[0]->op == GGML_OP_RMS_NORM ? 0 : 1;
- ggml_tensor *rms_norm = tensor->src[rms_norm_idx];
- switch (i) {
- case 0: srci = rms_norm->src[0]; break;
- case 1: srci = tensor->src[1 - rms_norm_idx]; break;
- default: continue;
+ for (int f = 0; f < ctx->num_additional_fused_ops + 1; ++f) {
+ tensor = cgraph->nodes[tensor_idx + f];
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
+ ggml_tensor * srci = tensor->src[i];
+ if (srci == nullptr) {
+ continue;
}
- }
- if (srci == nullptr) {
- continue;
- }
- ggml_tensor * srci_clone = ggml_dup_tensor(ggml_ctx, srci);
- size_t srci_size = ggml_nbytes(srci);
-
- src_clone[i] = srci_clone;
- src_size[i] = ggml_nbytes(srci);
- src_buffer[i] = malloc(srci_size);
-
- srci_clone->data = src_buffer[i];
- if (ggml_backend_buffer_is_host(srci->buffer)) {
- memcpy(srci_clone->data, srci->data, srci_size);
- memcpy(srci_clone->nb, srci->nb, sizeof(size_t) * GGML_MAX_DIMS);
- } else if (ggml_backend_buffer_is_vk(srci->buffer)) {
- ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)srci->buffer->context;
- vk_buffer& buffer_gpu = buf_ctx->dev_buffer;
- uint64_t offset = vk_tensor_offset(srci) + srci->view_offs;
- if (!ggml_is_contiguous(srci) && ggml_vk_dim01_contiguous(srci)) {
- for (int i3 = 0; i3 < srci->ne[3]; i3++) {
- for (int i2 = 0; i2 < srci->ne[2]; i2++) {
- const int idx = i3*srci->ne[2] + i2;
- ggml_vk_buffer_read(buffer_gpu, offset + idx * srci->nb[2], ((char *)srci_clone->data + idx * srci_clone->nb[2]), srci->ne[1] * srci->nb[1]);
+ // If a src tensor has been cloned, use that one
+ auto it = cloned_tensors.find(srci);
+ if (it != cloned_tensors.end()) {
+ src_clone[i] = it->second;
+ continue;
+ }
+ ggml_tensor * srci_clone = ggml_dup_tensor(ggml_ctx, srci);
+ size_t srci_size = ggml_nbytes(srci);
+
+ src_clone[i] = srci_clone;
+ void *src_buffer = malloc(srci_size);
+ cloned_mallocs.push_back(src_buffer);
+
+ srci_clone->data = src_buffer;
+ if (ggml_backend_buffer_is_host(srci->buffer)) {
+ memcpy(srci_clone->data, srci->data, srci_size);
+ memcpy(srci_clone->nb, srci->nb, sizeof(size_t) * GGML_MAX_DIMS);
+ } else if (ggml_backend_buffer_is_vk(srci->buffer)) {
+ ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)srci->buffer->context;
+ vk_buffer& buffer_gpu = buf_ctx->dev_buffer;
+ uint64_t offset = vk_tensor_offset(srci) + srci->view_offs;
+ if (!ggml_is_contiguous(srci) && ggml_vk_dim01_contiguous(srci)) {
+ for (int i3 = 0; i3 < srci->ne[3]; i3++) {
+ for (int i2 = 0; i2 < srci->ne[2]; i2++) {
+ const int idx = i3*srci->ne[2] + i2;
+ ggml_vk_buffer_read(buffer_gpu, offset + idx * srci->nb[2], ((char *)srci_clone->data + idx * srci_clone->nb[2]), srci->ne[1] * srci->nb[1]);
+ }
}
- }

- srci_clone->nb[0] = srci->nb[0];
- srci_clone->nb[1] = srci->nb[1];
- for (int i = 2; i < GGML_MAX_DIMS; i++) {
- srci_clone->nb[i] = srci_clone->nb[i - 1]*srci_clone->ne[i - 1];
+ srci_clone->nb[0] = srci->nb[0];
+ srci_clone->nb[1] = srci->nb[1];
+ for (int i = 2; i < GGML_MAX_DIMS; i++) {
+ srci_clone->nb[i] = srci_clone->nb[i - 1]*srci_clone->ne[i - 1];
+ }
+ } else {
+ if (offset + srci_size >= buffer_gpu->size) {
+ srci_size = buffer_gpu->size - offset;
+ }
+ ggml_vk_buffer_read(buffer_gpu, offset, srci_clone->data, srci_size);
+ memcpy(srci_clone->nb, srci->nb, sizeof(size_t) * GGML_MAX_DIMS);
}
} else {
- if (offset + srci_size >= buffer_gpu->size) {
- srci_size = buffer_gpu->size - offset;
- }
- ggml_vk_buffer_read(buffer_gpu, offset, srci_clone->data, srci_size);
- memcpy(srci_clone->nb, srci->nb, sizeof(size_t) * GGML_MAX_DIMS);
+ GGML_ABORT("fatal error");
}
- } else {
- GGML_ABORT("fatal error");
- }

- if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
- ggml_vk_print_tensor(srci, srci_name[i]);
+ if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
+ ggml_vk_print_tensor(srci, srci_name[i]);
+ }
}
- }

- if (tensor->op == GGML_OP_FLASH_ATTN_EXT) {
- const float * params = (const float *)tensor->op_params;
- tensor_clone = ggml_flash_attn_ext(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], src_clone[3], params[0], params[1], params[2]);
- if (src_clone[4]) {
- ggml_flash_attn_ext_add_sinks(tensor_clone, src_clone[4]);
- }
- } else if (tensor->op == GGML_OP_MUL_MAT) {
- tensor_clone = ggml_mul_mat(ggml_ctx, src_clone[0], src_clone[1]);
- } else if (tensor->op == GGML_OP_MUL_MAT_ID) {
- tensor_clone = ggml_mul_mat_id(ggml_ctx, src_clone[0], src_clone[1], src_clone[2]);
- } else if (tensor->op == GGML_OP_SUB) {
- tensor_clone = ggml_sub(ggml_ctx, src_clone[0], src_clone[1]);
- } else if (tensor->op == GGML_OP_MUL) {
- if (fused_rms_norm_mul) {
- tensor_clone = ggml_rms_norm(ggml_ctx, src_clone[0], *(float *)tensor->src[rms_norm_idx]->op_params);
- tensor_clone = ggml_mul(ggml_ctx, tensor_clone, src_clone[1 - rms_norm_idx]);
- } else {
+ if (tensor->op == GGML_OP_FLASH_ATTN_EXT) {
+ const float * params = (const float *)tensor->op_params;
+ tensor_clone = ggml_flash_attn_ext(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], src_clone[3], params[0], params[1], params[2]);
+ if (src_clone[4]) {
+ ggml_flash_attn_ext_add_sinks(tensor_clone, src_clone[4]);
+ }
+ } else if (tensor->op == GGML_OP_MUL_MAT) {
+ tensor_clone = ggml_mul_mat(ggml_ctx, src_clone[0], src_clone[1]);
+ } else if (tensor->op == GGML_OP_MUL_MAT_ID) {
+ tensor_clone = ggml_mul_mat_id(ggml_ctx, src_clone[0], src_clone[1], src_clone[2]);
+ } else if (tensor->op == GGML_OP_SUB) {
+ tensor_clone = ggml_sub(ggml_ctx, src_clone[0], src_clone[1]);
+ } else if (tensor->op == GGML_OP_MUL) {
tensor_clone = ggml_mul(ggml_ctx, src_clone[0], src_clone[1]);
- }
- } else if (tensor->op == GGML_OP_DIV) {
- tensor_clone = ggml_div(ggml_ctx, src_clone[0], src_clone[1]);
- } else if (tensor->op == GGML_OP_CONCAT) {
- tensor_clone = ggml_concat(ggml_ctx, src_clone[0], src_clone[1], *(int *)tensor->op_params);
- } else if (tensor->op == GGML_OP_UPSCALE) {
- tensor_clone = ggml_interpolate(ggml_ctx, src_clone[0], tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], (ggml_scale_mode) tensor->op_params[0]);
- } else if (tensor->op == GGML_OP_SCALE) {
- const float * params = (const float *)tensor->op_params;
- tensor_clone = ggml_scale_bias(ggml_ctx, src_clone[0], params[0], params[1]);
- } else if (tensor->op == GGML_OP_SQR) {
- tensor_clone = ggml_sqr(ggml_ctx, src_clone[0]);
- } else if (tensor->op == GGML_OP_SQRT) {
- tensor_clone = ggml_sqrt(ggml_ctx, src_clone[0]);
- } else if (tensor->op == GGML_OP_SIN) {
- tensor_clone = ggml_sin(ggml_ctx, src_clone[0]);
- } else if (tensor->op == GGML_OP_COS) {
- tensor_clone = ggml_cos(ggml_ctx, src_clone[0]);
- } else if (tensor->op == GGML_OP_CLAMP) {
- const float * params = (const float *)tensor->op_params;
- tensor_clone = ggml_clamp(ggml_ctx, src_clone[0], params[0], params[1]);
- } else if (tensor->op == GGML_OP_PAD) {
- tensor_clone = ggml_pad_ext(ggml_ctx, src_clone[0], tensor->op_params[0], tensor->op_params[1], tensor->op_params[2], tensor->op_params[3],
- tensor->op_params[4], tensor->op_params[5], tensor->op_params[6], tensor->op_params[7]);
- } else if (tensor->op == GGML_OP_REPEAT) {
- tensor_clone = ggml_repeat(ggml_ctx, src_clone[0], tensor);
- } else if (tensor->op == GGML_OP_REPEAT_BACK) {
- tensor_clone = ggml_repeat_back(ggml_ctx, src_clone[0], tensor);
- } else if (tensor->op == GGML_OP_ADD) {
- tensor_clone = ggml_add(ggml_ctx, src_clone[0], src_clone[1]);
- } else if (tensor->op == GGML_OP_ACC) {
- tensor_clone = ggml_acc(ggml_ctx, src_clone[0], src_clone[1], tensor->op_params[0], tensor->op_params[1], tensor->op_params[2], tensor->op_params[3]);
- } else if (tensor->op == GGML_OP_NORM) {
- tensor_clone = ggml_norm(ggml_ctx, src_clone[0], *(float *)tensor->op_params);
- } else if (tensor->op == GGML_OP_GROUP_NORM) {
- const float * float_params = (const float *)tensor->op_params;
- tensor_clone = ggml_group_norm(ggml_ctx, src_clone[0], tensor->op_params[0], float_params[1]);
- } else if (tensor->op == GGML_OP_RMS_NORM) {
- tensor_clone = ggml_rms_norm(ggml_ctx, src_clone[0], *(float *)tensor->op_params);
- } else if (tensor->op == GGML_OP_RMS_NORM_BACK) {
- const float eps = ((float *) tensor->op_params)[0];
- tensor_clone = ggml_rms_norm_back(ggml_ctx, src_clone[0], src_clone[1], eps);
- } else if (tensor->op == GGML_OP_SILU_BACK) {
- tensor_clone = ggml_silu_back(ggml_ctx, src_clone[0], src_clone[1]);
- } else if (tensor->op == GGML_OP_L2_NORM) {
- const float eps = ((float *) tensor->op_params)[0];
- tensor_clone = ggml_l2_norm(ggml_ctx, src_clone[0], eps);
- } else if (tensor->op == GGML_OP_SOFT_MAX) {
- if (src1 != nullptr) {
+ } else if (tensor->op == GGML_OP_DIV) {
+ tensor_clone = ggml_div(ggml_ctx, src_clone[0], src_clone[1]);
+ } else if (tensor->op == GGML_OP_CONCAT) {
+ tensor_clone = ggml_concat(ggml_ctx, src_clone[0], src_clone[1], *(int *)tensor->op_params);
+ } else if (tensor->op == GGML_OP_UPSCALE) {
+ tensor_clone = ggml_interpolate(ggml_ctx, src_clone[0], tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], (ggml_scale_mode) tensor->op_params[0]);
+ } else if (tensor->op == GGML_OP_SCALE) {
const float * params = (const float *)tensor->op_params;
- tensor_clone = ggml_soft_max_ext(ggml_ctx, src_clone[0], src_clone[1], params[0], params[1]);
- } else {
- tensor_clone = ggml_soft_max(ggml_ctx, src_clone[0]);
- }
- } else if (tensor->op == GGML_OP_SOFT_MAX_BACK) {
- tensor_clone = ggml_soft_max_ext_back(ggml_ctx, src_clone[0], src_clone[1], ((float *)tensor->op_params)[0], ((float *)tensor->op_params)[1]);
- } else if (tensor->op == GGML_OP_DIAG_MASK_INF) {
- tensor_clone = ggml_diag_mask_inf(ggml_ctx, src_clone[0], tensor->op_params[0]);
- } else if (tensor->op == GGML_OP_ROPE || tensor->op == GGML_OP_ROPE_BACK) {
- const int n_dims = ((int32_t *) tensor->op_params)[1];
- const int mode = ((int32_t *) tensor->op_params)[2];
- //const int n_ctx_ggml = ((int32_t *) tensor->op_params)[3];
- const int n_ctx_orig_ggml = ((int32_t *) tensor->op_params)[4];
- const float freq_base = ((float *) tensor->op_params)[5];
- const float freq_scale = ((float *) tensor->op_params)[6];
- const float ext_factor = ((float *) tensor->op_params)[7];
- const float attn_factor = ((float *) tensor->op_params)[8];
- const float beta_fast = ((float *) tensor->op_params)[9];
- const float beta_slow = ((float *) tensor->op_params)[10];
- if (mode & GGML_ROPE_TYPE_MROPE) {
- int32_t *sections = ((int32_t *) tensor->op_params) + 11;
- if (tensor->op == GGML_OP_ROPE) {
- tensor_clone = ggml_rope_multi(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], n_dims, sections, mode, n_ctx_orig_ggml, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
+ tensor_clone = ggml_scale_bias(ggml_ctx, src_clone[0], params[0], params[1]);
+ } else if (tensor->op == GGML_OP_SQR) {
+ tensor_clone = ggml_sqr(ggml_ctx, src_clone[0]);
+ } else if (tensor->op == GGML_OP_SQRT) {
+ tensor_clone = ggml_sqrt(ggml_ctx, src_clone[0]);
+ } else if (tensor->op == GGML_OP_SIN) {
+ tensor_clone = ggml_sin(ggml_ctx, src_clone[0]);
+ } else if (tensor->op == GGML_OP_COS) {
+ tensor_clone = ggml_cos(ggml_ctx, src_clone[0]);
+ } else if (tensor->op == GGML_OP_CLAMP) {
+ const float * params = (const float *)tensor->op_params;
+ tensor_clone = ggml_clamp(ggml_ctx, src_clone[0], params[0], params[1]);
+ } else if (tensor->op == GGML_OP_PAD) {
+ tensor_clone = ggml_pad_ext(ggml_ctx, src_clone[0], tensor->op_params[0], tensor->op_params[1], tensor->op_params[2], tensor->op_params[3],
+ tensor->op_params[4], tensor->op_params[5], tensor->op_params[6], tensor->op_params[7]);
+ } else if (tensor->op == GGML_OP_REPEAT) {
+ tensor_clone = ggml_repeat(ggml_ctx, src_clone[0], tensor);
+ } else if (tensor->op == GGML_OP_REPEAT_BACK) {
+ tensor_clone = ggml_repeat_back(ggml_ctx, src_clone[0], tensor);
+ } else if (tensor->op == GGML_OP_ADD) {
+ tensor_clone = ggml_add(ggml_ctx, src_clone[0], src_clone[1]);
+ } else if (tensor->op == GGML_OP_ACC) {
+ tensor_clone = ggml_acc(ggml_ctx, src_clone[0], src_clone[1], tensor->op_params[0], tensor->op_params[1], tensor->op_params[2], tensor->op_params[3]);
+ } else if (tensor->op == GGML_OP_NORM) {
+ tensor_clone = ggml_norm(ggml_ctx, src_clone[0], *(float *)tensor->op_params);
+ } else if (tensor->op == GGML_OP_GROUP_NORM) {
+ const float * float_params = (const float *)tensor->op_params;
+ tensor_clone = ggml_group_norm(ggml_ctx, src_clone[0], tensor->op_params[0], float_params[1]);
+ } else if (tensor->op == GGML_OP_RMS_NORM) {
+ tensor_clone = ggml_rms_norm(ggml_ctx, src_clone[0], *(float *)tensor->op_params);
+ } else if (tensor->op == GGML_OP_RMS_NORM_BACK) {
+ const float eps = ((float *) tensor->op_params)[0];
+ tensor_clone = ggml_rms_norm_back(ggml_ctx, src_clone[0], src_clone[1], eps);
+ } else if (tensor->op == GGML_OP_SILU_BACK) {
+ tensor_clone = ggml_silu_back(ggml_ctx, src_clone[0], src_clone[1]);
+ } else if (tensor->op == GGML_OP_L2_NORM) {
+ const float eps = ((float *) tensor->op_params)[0];
+ tensor_clone = ggml_l2_norm(ggml_ctx, src_clone[0], eps);
+ } else if (tensor->op == GGML_OP_SOFT_MAX) {
+ if (tensor->src[1] != nullptr) {
+ const float * params = (const float *)tensor->op_params;
+ tensor_clone = ggml_soft_max_ext(ggml_ctx, src_clone[0], src_clone[1], params[0], params[1]);
} else {
- tensor_clone = ggml_rope_multi_back(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], n_dims, sections, mode, n_ctx_orig_ggml, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
+ tensor_clone = ggml_soft_max(ggml_ctx, src_clone[0]);
}
- } else {
- if (tensor->op == GGML_OP_ROPE) {
- tensor_clone = ggml_rope_ext(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], n_dims, mode, n_ctx_orig_ggml, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
+ } else if (tensor->op == GGML_OP_SOFT_MAX_BACK) {
+ tensor_clone = ggml_soft_max_ext_back(ggml_ctx, src_clone[0], src_clone[1], ((float *)tensor->op_params)[0], ((float *)tensor->op_params)[1]);
+ } else if (tensor->op == GGML_OP_DIAG_MASK_INF) {
+ tensor_clone = ggml_diag_mask_inf(ggml_ctx, src_clone[0], tensor->op_params[0]);
+ } else if (tensor->op == GGML_OP_ROPE || tensor->op == GGML_OP_ROPE_BACK) {
+ const int n_dims = ((int32_t *) tensor->op_params)[1];
+ const int mode = ((int32_t *) tensor->op_params)[2];
+ //const int n_ctx_ggml = ((int32_t *) tensor->op_params)[3];
+ const int n_ctx_orig_ggml = ((int32_t *) tensor->op_params)[4];
+ const float freq_base = ((float *) tensor->op_params)[5];
+ const float freq_scale = ((float *) tensor->op_params)[6];
+ const float ext_factor = ((float *) tensor->op_params)[7];
+ const float attn_factor = ((float *) tensor->op_params)[8];
+ const float beta_fast = ((float *) tensor->op_params)[9];
+ const float beta_slow = ((float *) tensor->op_params)[10];
+ if (mode & GGML_ROPE_TYPE_MROPE) {
+ int32_t *sections = ((int32_t *) tensor->op_params) + 11;
+ if (tensor->op == GGML_OP_ROPE) {
+ tensor_clone = ggml_rope_multi(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], n_dims, sections, mode, n_ctx_orig_ggml, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
+ } else {
+ tensor_clone = ggml_rope_multi_back(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], n_dims, sections, mode, n_ctx_orig_ggml, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
+ }
+ } else {
+ if (tensor->op == GGML_OP_ROPE) {
+ tensor_clone = ggml_rope_ext(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], n_dims, mode, n_ctx_orig_ggml, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
+ } else {
+ tensor_clone = ggml_rope_ext_back(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], n_dims, mode, n_ctx_orig_ggml, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
+ }
+ }
+ } else if (tensor->op == GGML_OP_UNARY) {
+ switch (ggml_get_unary_op(tensor)) {
+ case GGML_UNARY_OP_EXP:
+ tensor_clone = ggml_exp(ggml_ctx, src_clone[0]);
+ break;
+ case GGML_UNARY_OP_SILU:
+ tensor_clone = ggml_silu(ggml_ctx, src_clone[0]);
+ break;
+ case GGML_UNARY_OP_GELU:
+ tensor_clone = ggml_gelu(ggml_ctx, src_clone[0]);
+ break;
+ case GGML_UNARY_OP_GELU_ERF:
+ tensor_clone = ggml_gelu_erf(ggml_ctx, src_clone[0]);
+ break;
+ case GGML_UNARY_OP_GELU_QUICK:
+ tensor_clone = ggml_gelu_quick(ggml_ctx, src_clone[0]);
+ break;
+ case GGML_UNARY_OP_RELU:
+ tensor_clone = ggml_relu(ggml_ctx, src_clone[0]);
+ break;
+ case GGML_UNARY_OP_TANH:
+ tensor_clone = ggml_tanh(ggml_ctx, src_clone[0]);
+ break;
+ case GGML_UNARY_OP_SIGMOID:
+ tensor_clone = ggml_sigmoid(ggml_ctx, src_clone[0]);
+ break;
+ case GGML_UNARY_OP_HARDSIGMOID:
+ tensor_clone = ggml_hardsigmoid(ggml_ctx, src_clone[0]);
+ break;
+ case GGML_UNARY_OP_HARDSWISH:
+ tensor_clone = ggml_hardswish(ggml_ctx, src_clone[0]);
+ break;
+ default:
+ std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
+ GGML_ABORT("fatal error");
+ }
+ } else if (tensor->op == GGML_OP_GLU) {
+ if (src_clone[1] == nullptr) {
+ tensor_clone = ggml_glu(ggml_ctx, src_clone[0], (ggml_glu_op) tensor->op_params[0], tensor->op_params[1]);
+ } else {
+ tensor_clone = ggml_glu_split(ggml_ctx, src_clone[0], src_clone[1], (ggml_glu_op) tensor->op_params[0]);
+ }
+ ggml_set_op_params_i32(tensor_clone, 2, ggml_get_op_params_i32(tensor, 2));
+ ggml_set_op_params_i32(tensor_clone, 3, ggml_get_op_params_i32(tensor, 3));
+ } else if (tensor->op == GGML_OP_CPY || tensor->op == GGML_OP_DUP) {
+ if (tensor->src[1] == nullptr) {
+ tensor_clone = ggml_dup(ggml_ctx, src_clone[0]);
+ tensor_clone->type = tensor->type;
} else {
- tensor_clone = ggml_rope_ext_back(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], n_dims, mode, n_ctx_orig_ggml, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
+ tensor_clone = ggml_cpy(ggml_ctx, src_clone[0], src_clone[1]);
}
+ } else if (tensor->op == GGML_OP_CONT) {
+ tensor_clone = ggml_cont_4d(ggml_ctx, src_clone[0], tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
+ } else if (tensor->op == GGML_OP_RESHAPE) {
+ tensor_clone = ggml_reshape_4d(ggml_ctx, src_clone[0], tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
+ } else if (tensor->op == GGML_OP_VIEW) {
+ tensor_clone = ggml_view_4d(ggml_ctx, src_clone[0], tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], tensor->nb[1], tensor->nb[2], tensor->nb[3], ((int32_t *) tensor->op_params)[0]);
+ } else if (tensor->op == GGML_OP_PERMUTE) {
+ int32_t * params = (int32_t *)tensor->op_params;
+ tensor_clone = ggml_permute(ggml_ctx, src_clone[0], params[0], params[1], params[2], params[3]);
+ } else if (tensor->op == GGML_OP_TRANSPOSE) {
+ tensor_clone = ggml_transpose(ggml_ctx, src_clone[0]);
+ } else if (tensor->op == GGML_OP_GET_ROWS) {
+ tensor_clone = ggml_get_rows(ggml_ctx, src_clone[0], src_clone[1]);
+ } else if (tensor->op == GGML_OP_ARGSORT) {
+ tensor_clone = ggml_argsort(ggml_ctx, src_clone[0], (ggml_sort_order) *(int *)tensor->op_params);
+ } else if (tensor->op == GGML_OP_SUM) {
+ tensor_clone = ggml_sum(ggml_ctx, src_clone[0]);
+ } else if (tensor->op == GGML_OP_SUM_ROWS) {
+ tensor_clone = ggml_sum_rows(ggml_ctx, src_clone[0]);
+ } else if (tensor->op == GGML_OP_MEAN) {
+ tensor_clone = ggml_mean(ggml_ctx, src_clone[0]);
+ } else if (tensor->op == GGML_OP_ARGMAX) {
+ tensor_clone = ggml_argmax(ggml_ctx, src_clone[0]);
+ } else if (tensor->op == GGML_OP_COUNT_EQUAL) {
+ tensor_clone = ggml_count_equal(ggml_ctx, src_clone[0], src_clone[1]);
+ } else if (tensor->op == GGML_OP_IM2COL) {
+ const int32_t s0 = tensor->op_params[0];
+ const int32_t s1 = tensor->op_params[1];
+ const int32_t p0 = tensor->op_params[2];
+ const int32_t p1 = tensor->op_params[3];
+ const int32_t d0 = tensor->op_params[4];
+ const int32_t d1 = tensor->op_params[5];
+
+ const bool is_2D = tensor->op_params[6] == 1;
+ tensor_clone = ggml_im2col(ggml_ctx, src_clone[0], src_clone[1], s0, s1, p0, p1, d0, d1, is_2D, tensor->type);
+ } else if (tensor->op == GGML_OP_IM2COL_3D) {
+ const int32_t s0 = tensor->op_params[0];
+ const int32_t s1 = tensor->op_params[1];
+ const int32_t s2 = tensor->op_params[2];
+ const int32_t p0 = tensor->op_params[3];
+ const int32_t p1 = tensor->op_params[4];
+ const int32_t p2 = tensor->op_params[5];
+ const int32_t d0 = tensor->op_params[6];
+ const int32_t d1 = tensor->op_params[7];
+ const int32_t d2 = tensor->op_params[8];
+ const int32_t IC = tensor->op_params[9];
+
+ tensor_clone = ggml_im2col_3d(ggml_ctx, src_clone[0], src_clone[1], IC, s0, s1, s2, p0, p1, p2, d0, d1, d2, tensor->type);
+ } else if (tensor->op == GGML_OP_TIMESTEP_EMBEDDING) {
+ const int32_t dim = tensor->op_params[0];
+ const int32_t max_period = tensor->op_params[1];
+ tensor_clone = ggml_timestep_embedding(ggml_ctx, src_clone[0], dim, max_period);
+ } else if (tensor->op == GGML_OP_CONV_TRANSPOSE_1D){
+ const int32_t s0 = tensor->op_params[0];
+ const int32_t p0 = tensor->op_params[1];
+ const int32_t d0 = tensor->op_params[2];
+ tensor_clone = ggml_conv_transpose_1d(ggml_ctx, src_clone[0], src_clone[1], s0, p0, d0);
+ } else if (tensor->op == GGML_OP_POOL_2D) {
+ enum ggml_op_pool op = static_cast<ggml_op_pool>(tensor->op_params[0]);
+ const int32_t k0 = tensor->op_params[1];
+ const int32_t k1 = tensor->op_params[2];
+ const int32_t s0 = tensor->op_params[3];
+ const int32_t s1 = tensor->op_params[4];
+ const int32_t p0 = tensor->op_params[5];
+ const int32_t p1 = tensor->op_params[6];
+
+ tensor_clone = ggml_pool_2d(ggml_ctx, src_clone[0], op, k0, k1, s0, s1, p0, p1);
+ } else if (tensor->op == GGML_OP_CONV_2D) {
+ const int32_t s0 = tensor->op_params[0];
+ const int32_t s1 = tensor->op_params[1];
+ const int32_t p0 = tensor->op_params[2];
+ const int32_t p1 = tensor->op_params[3];
+ const int32_t d0 = tensor->op_params[4];
+ const int32_t d1 = tensor->op_params[5];
+ tensor_clone = ggml_conv_2d(ggml_ctx, src_clone[0], src_clone[1], s0, s1, p0, p1, d0, d1);
+ } else if (tensor->op == GGML_OP_CONV_2D_DW) {
+ const int32_t s0 = tensor->op_params[0];
+ const int32_t s1 = tensor->op_params[1];
+ const int32_t p0 = tensor->op_params[2];
+ const int32_t p1 = tensor->op_params[3];
+ const int32_t d0 = tensor->op_params[4];
+ const int32_t d1 = tensor->op_params[5];
+ tensor_clone = ggml_conv_2d_dw_direct(ggml_ctx, src_clone[0], src_clone[1], s0, s1, p0, p1, d0, d1);
+ } else if (tensor->op == GGML_OP_CONV_TRANSPOSE_2D) {
+ const int32_t s = tensor->op_params[0];
+ tensor_clone = ggml_conv_transpose_2d_p0(ggml_ctx, src_clone[0], src_clone[1], s);
+ } else if (tensor->op == GGML_OP_LEAKY_RELU) {
+ const float * op_params = (const float *)tensor->op_params;
+ tensor_clone = ggml_leaky_relu(ggml_ctx, src_clone[0], op_params[0], false);
+ } else if (tensor->op == GGML_OP_RWKV_WKV6) {
+ tensor_clone = ggml_rwkv_wkv6(ggml_ctx, src_clone[0], src_clone[1],
+ src_clone[2], src_clone[3], src_clone[4], src_clone[5]);
+ } else if (tensor->op == GGML_OP_RWKV_WKV7) {
+ tensor_clone = ggml_rwkv_wkv7(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], src_clone[3],
+ src_clone[4], src_clone[5], src_clone[6]);
+ } else if (tensor->op == GGML_OP_OPT_STEP_ADAMW) {
+ src_clone[0]->flags = tensor->src[0]->flags;
+ tensor_clone = ggml_opt_step_adamw(ggml_ctx, src_clone[0], src_clone[1],
+ src_clone[2], src_clone[3], src_clone[4]);
+ } else if (tensor->op == GGML_OP_OPT_STEP_SGD) {
+ src_clone[0]->flags = tensor->src[0]->flags;
+ tensor_clone = ggml_opt_step_sgd(ggml_ctx, src_clone[0], src_clone[1],
+ src_clone[2]);
+ } else if (tensor->op == GGML_OP_ADD_ID) {
+ tensor_clone = ggml_add_id(ggml_ctx, src_clone[0], src_clone[1], src_clone[2]);
+ } else if (tensor->op == GGML_OP_SSM_SCAN) {
+ tensor_clone = ggml_ssm_scan(ggml_ctx, src_clone[0], src_clone[1], src_clone[2],
+ src_clone[3], src_clone[4], src_clone[5], src_clone[6]);
+ } else if (tensor->op == GGML_OP_SSM_CONV) {
+ tensor_clone = ggml_ssm_conv(ggml_ctx, src_clone[0], src_clone[1]);
+ } else if (tensor->op == GGML_OP_ROLL) {
+ const int32_t s0 = tensor->op_params[0];
+ const int32_t s1 = tensor->op_params[1];
+ const int32_t s2 = tensor->op_params[2];
+ const int32_t s3 = tensor->op_params[3];
+ tensor_clone = ggml_roll(ggml_ctx, src_clone[0], s0, s1, s2, s3);
}
- } else if (tensor->op == GGML_OP_UNARY) {
- switch (ggml_get_unary_op(tensor)) {
- case GGML_UNARY_OP_EXP:
- tensor_clone = ggml_exp(ggml_ctx, src_clone[0]);
- break;
- case GGML_UNARY_OP_SILU:
- tensor_clone = ggml_silu(ggml_ctx, src_clone[0]);
- break;
- case GGML_UNARY_OP_GELU:
- tensor_clone = ggml_gelu(ggml_ctx, src_clone[0]);
- break;
- case GGML_UNARY_OP_GELU_ERF:
- tensor_clone = ggml_gelu_erf(ggml_ctx, src_clone[0]);
- break;
- case GGML_UNARY_OP_GELU_QUICK:
- tensor_clone = ggml_gelu_quick(ggml_ctx, src_clone[0]);
- break;
- case GGML_UNARY_OP_RELU:
- tensor_clone = ggml_relu(ggml_ctx, src_clone[0]);
- break;
- case GGML_UNARY_OP_TANH:
- tensor_clone = ggml_tanh(ggml_ctx, src_clone[0]);
- break;
- case GGML_UNARY_OP_SIGMOID:
- tensor_clone = ggml_sigmoid(ggml_ctx, src_clone[0]);
- break;
- case GGML_UNARY_OP_HARDSIGMOID:
- tensor_clone = ggml_hardsigmoid(ggml_ctx, src_clone[0]);
- break;
- case GGML_UNARY_OP_HARDSWISH:
- tensor_clone = ggml_hardswish(ggml_ctx, src_clone[0]);
- break;
- default:
+ else {
std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
GGML_ABORT("fatal error");
}
- } else if (tensor->op == GGML_OP_GLU) {
- if (src_clone[1] == nullptr) {
- tensor_clone = ggml_glu(ggml_ctx, src_clone[0], (ggml_glu_op) tensor->op_params[0], tensor->op_params[1]);
- } else {
- tensor_clone = ggml_glu_split(ggml_ctx, src_clone[0], src_clone[1], (ggml_glu_op) tensor->op_params[0]);
- }
- ggml_set_op_params_i32(tensor_clone, 2, ggml_get_op_params_i32(tensor, 2));
- ggml_set_op_params_i32(tensor_clone, 3, ggml_get_op_params_i32(tensor, 3));
- } else if (tensor->op == GGML_OP_CPY || tensor->op == GGML_OP_DUP) {
- if (src1 == nullptr) {
- tensor_clone = ggml_dup(ggml_ctx, src_clone[0]);
- tensor_clone->type = tensor->type;
- } else {
- tensor_clone = ggml_cpy(ggml_ctx, src_clone[0], src_clone[1]);
- }
- } else if (tensor->op == GGML_OP_CONT) {
- tensor_clone = ggml_cont_4d(ggml_ctx, src_clone[0], tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
- } else if (tensor->op == GGML_OP_RESHAPE) {
- tensor_clone = ggml_reshape_4d(ggml_ctx, src_clone[0], tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
- } else if (tensor->op == GGML_OP_VIEW) {
- tensor_clone = ggml_view_4d(ggml_ctx, src_clone[0], tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], tensor->nb[1], tensor->nb[2], tensor->nb[3], ((int32_t *) tensor->op_params)[0]);
- } else if (tensor->op == GGML_OP_PERMUTE) {
- int32_t * params = (int32_t *)tensor->op_params;
- tensor_clone = ggml_permute(ggml_ctx, src_clone[0], params[0], params[1], params[2], params[3]);
- } else if (tensor->op == GGML_OP_TRANSPOSE) {
- tensor_clone = ggml_transpose(ggml_ctx, src_clone[0]);
- } else if (tensor->op == GGML_OP_GET_ROWS) {
- tensor_clone = ggml_get_rows(ggml_ctx, src_clone[0], src_clone[1]);
- } else if (tensor->op == GGML_OP_ARGSORT) {
- tensor_clone = ggml_argsort(ggml_ctx, src_clone[0], (ggml_sort_order) *(int *)tensor->op_params);
- } else if (tensor->op == GGML_OP_SUM) {
- tensor_clone = ggml_sum(ggml_ctx, src_clone[0]);
- } else if (tensor->op == GGML_OP_SUM_ROWS) {
- tensor_clone = ggml_sum_rows(ggml_ctx, src_clone[0]);
- } else if (tensor->op == GGML_OP_MEAN) {
- tensor_clone = ggml_mean(ggml_ctx, src_clone[0]);
- } else if (tensor->op == GGML_OP_ARGMAX) {
- tensor_clone = ggml_argmax(ggml_ctx, src_clone[0]);
- } else if (tensor->op == GGML_OP_COUNT_EQUAL) {
- tensor_clone = ggml_count_equal(ggml_ctx, src_clone[0], src_clone[1]);
- } else if (tensor->op == GGML_OP_IM2COL) {
- const int32_t s0 = tensor->op_params[0];
- const int32_t s1 = tensor->op_params[1];
- const int32_t p0 = tensor->op_params[2];
- const int32_t p1 = tensor->op_params[3];
- const int32_t d0 = tensor->op_params[4];
- const int32_t d1 = tensor->op_params[5];
-
- const bool is_2D = tensor->op_params[6] == 1;
- tensor_clone = ggml_im2col(ggml_ctx, src_clone[0], src_clone[1], s0, s1, p0, p1, d0, d1, is_2D, tensor->type);
- } else if (tensor->op == GGML_OP_IM2COL_3D) {
- const int32_t s0 = tensor->op_params[0];
- const int32_t s1 = tensor->op_params[1];
- const int32_t s2 = tensor->op_params[2];
- const int32_t p0 = tensor->op_params[3];
- const int32_t p1 = tensor->op_params[4];
- const int32_t p2 = tensor->op_params[5];
- const int32_t d0 = tensor->op_params[6];
- const int32_t d1 = tensor->op_params[7];
- const int32_t d2 = tensor->op_params[8];
- const int32_t IC = tensor->op_params[9];
-
- tensor_clone = ggml_im2col_3d(ggml_ctx, src_clone[0], src_clone[1], IC, s0, s1, s2, p0, p1, p2, d0, d1, d2, tensor->type);
- } else if (tensor->op == GGML_OP_TIMESTEP_EMBEDDING) {
- const int32_t dim = tensor->op_params[0];
- const int32_t max_period = tensor->op_params[1];
- tensor_clone = ggml_timestep_embedding(ggml_ctx, src_clone[0], dim, max_period);
- } else if (tensor->op == GGML_OP_CONV_TRANSPOSE_1D){
- const int32_t s0 = tensor->op_params[0];
- const int32_t p0 = tensor->op_params[1];
- const int32_t d0 = tensor->op_params[2];
- tensor_clone = ggml_conv_transpose_1d(ggml_ctx, src_clone[0], src_clone[1], s0, p0, d0);
- } else if (tensor->op == GGML_OP_POOL_2D) {
- enum ggml_op_pool op = static_cast<ggml_op_pool>(tensor->op_params[0]);
- const int32_t k0 = tensor->op_params[1];
- const int32_t k1 = tensor->op_params[2];
- const int32_t s0 = tensor->op_params[3];
- const int32_t s1 = tensor->op_params[4];
- const int32_t p0 = tensor->op_params[5];
- const int32_t p1 = tensor->op_params[6];
-
- tensor_clone = ggml_pool_2d(ggml_ctx, src_clone[0], op, k0, k1, s0, s1, p0, p1);
- } else if (tensor->op == GGML_OP_CONV_2D) {
- const int32_t s0 = tensor->op_params[0];
- const int32_t s1 = tensor->op_params[1];
- const int32_t p0 = tensor->op_params[2];
- const int32_t p1 = tensor->op_params[3];
- const int32_t d0 = tensor->op_params[4];
- const int32_t d1 = tensor->op_params[5];
- tensor_clone = ggml_conv_2d(ggml_ctx, src_clone[0], src_clone[1], s0, s1, p0, p1, d0, d1);
- } else if (tensor->op == GGML_OP_CONV_TRANSPOSE_2D) {
- const int32_t s = tensor->op_params[0];
- tensor_clone = ggml_conv_transpose_2d_p0(ggml_ctx, src_clone[0], src_clone[1], s);
- } else if (tensor->op == GGML_OP_LEAKY_RELU) {
- const float * op_params = (const float *)tensor->op_params;
- tensor_clone = ggml_leaky_relu(ggml_ctx, src_clone[0], op_params[0], false);
- } else if (tensor->op == GGML_OP_RWKV_WKV6) {
- tensor_clone = ggml_rwkv_wkv6(ggml_ctx, src_clone[0], src_clone[1],
- src_clone[2], src_clone[3], src_clone[4], src_clone[5]);
- } else if (tensor->op == GGML_OP_RWKV_WKV7) {
- tensor_clone = ggml_rwkv_wkv7(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], src_clone[3],
- src_clone[4], src_clone[5], src_clone[6]);
- } else if (tensor->op == GGML_OP_OPT_STEP_ADAMW) {
- src_clone[0]->flags = src0->flags;
- tensor_clone = ggml_opt_step_adamw(ggml_ctx, src_clone[0], src_clone[1],
- src_clone[2], src_clone[3], src_clone[4]);
- } else if (tensor->op == GGML_OP_OPT_STEP_SGD) {
- src_clone[0]->flags = src0->flags;
- tensor_clone = ggml_opt_step_sgd(ggml_ctx, src_clone[0], src_clone[1],
- src_clone[2]);
- } else if (tensor->op == GGML_OP_ADD_ID) {
- tensor_clone = ggml_add_id(ggml_ctx, src_clone[0], src_clone[1], src_clone[2]);
- } else if (tensor->op == GGML_OP_SSM_SCAN) {
- tensor_clone = ggml_ssm_scan(ggml_ctx, src_clone[0], src_clone[1], src_clone[2],
- src_clone[3], src_clone[4], src_clone[5], src_clone[6]);
- } else if (tensor->op == GGML_OP_SSM_CONV) {
- tensor_clone = ggml_ssm_conv(ggml_ctx, src_clone[0], src_clone[1]);
- }
- else {
- std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
- GGML_ABORT("fatal error");
+ cloned_tensors[tensor] = tensor_clone;
}

ggml_cgraph * cgraph_cpu = ggml_new_graph(ggml_ctx);
@@ -14476,10 +14475,8 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *
memcpy(comp_result, tensor_clone->data, comp_size);
memcpy(comp_nb, tensor_clone->nb, sizeof(size_t) * GGML_MAX_DIMS);

- for (int i = 0; i < GGML_MAX_SRC; i++) {
- if (src_buffer[i] != nullptr) {
- free(src_buffer[i]);
- }
+ for (auto m : cloned_mallocs) {
+ free(m);
}

ggml_free(ggml_ctx);
@@ -14488,15 +14485,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *
}

static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, int tensor_idx) {
- ggml_tensor * tensor = cgraph->nodes[tensor_idx];
+ ggml_tensor * tensor = cgraph->nodes[tensor_idx + ctx->num_additional_fused_ops];
if (tensor->op == GGML_OP_TRANSPOSE || tensor->op == GGML_OP_SET_ROWS) {
return;
}
- if (ctx->num_additional_fused_ops == 1 &&
- tensor->op == GGML_OP_RMS_NORM &&
- cgraph->nodes[tensor_idx + 1]->op == GGML_OP_MUL) {
- tensor = cgraph->nodes[tensor_idx + 1];
- }

if (!(vk_output_tensor > 0 && vk_output_tensor == check_counter) && check_counter <= vk_skip_checks) {
return;