@@ -192,7 +192,7 @@ static void ggml_check_sycl() try {
     if (!initialized) {
         g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
-        g_ggml_sycl_disable_optimize= get_sycl_env("GGML_SYCL_DISABLE_OPT", 1);
+        g_ggml_sycl_disable_optimize= get_sycl_env("GGML_SYCL_DISABLE_OPT", 0);
         g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1);
         GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n");
         GGML_LOG_INFO("Running with Environment Variables:\n");
@@ -2852,6 +2852,64 @@ static bool ggml_sycl_supports_dmmv(enum ggml_type type) {
     }
 }
 
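+// Reorder a Q4_0 tensor in place: instead of an array of block_q4_0 structs
+// (each block carrying its scale d and QK4_0/2 packed 4-bit values), the buffer
+// ends up holding the packed qs data of all blocks first, followed by all d
+// scales. This reordered layout is what the decode/dequantize steps read once
+// extra->optimized_feature.reorder has been set.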
+static void reorder_qw(char *data_device, const int ncols, const int nrows,
+                       size_t size, size_t offset, dpct::queue_ptr stream) {
+    auto tmp_buf = sycl::malloc_shared<char>(size, *stream);
+    SYCL_CHECK(
+        CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size)
+            .wait()));
+    GGML_ASSERT((size % sizeof(block_q4_0) == 0));
+    GGML_ASSERT((offset % sizeof(block_q4_0) == 0));
+    int offset_blks = offset / sizeof(block_q4_0);
+    auto qs_ptr = (uint8_t*)data_device + offset_blks * QK4_0 / 2;
+    auto d_ptr = (sycl::half*)(qs_ptr + ncols * nrows / 2) + offset_blks;
+
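+    // One work-item per block_q4_0: copy the block's packed 4-bit values into the
+    // contiguous qs region and its scale into the contiguous d region.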
+    stream->parallel_for(
+        size / sizeof(block_q4_0),
+        [=](auto i) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+            const block_q4_0* x = (const block_q4_0*)tmp_buf;
+            const int ib = i;
+
+            for (int j = 0; j < QK4_0/2; j ++)
+            {
+                *(qs_ptr + ib * QK4_0 / 2 + j) = x[ib].qs[j];
+            }
+            *(d_ptr + ib) = x[ib].d;
+        });
+
+    sycl::free(tmp_buf, *stream);
+}
+
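+// Reorder the whole tensor: offset 0 and size = ggml_nbytes(src0).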
+static void reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
+    char*data_device = (char*)src0->data;
+    size_t ncols = src0->ne[0];
+    size_t nrows = src0->ne[1];
+    size_t size = ggml_nbytes(src0);
+
+    reorder_qw(data_device, ncols, nrows, size, 0, stream);
+}
+
+/*
+* This function should be called when the OP (mul_mat) function supports the reorder optimization.
+*/
+static void opt_for_reorder(ggml_backend_sycl_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1,
+                            ggml_tensor * dst) {
+    if (!g_ggml_sycl_disable_optimize &&   // optimization is allowed, controlled by the env var GGML_SYCL_DISABLE_OPT
+        ctx->opt_feature.reorder &&        // allow only devices where the reorder performs well, skip devices where it does not
+        dst->op == GGML_OP_MUL_MAT &&      // limited to the supported Q4_0 cases for now, more cases to be added
+        src0->type == GGML_TYPE_Q4_0 &&
+        src1->ne[2]==1 && src1->ne[3]==1) {
+
+        ggml_tensor_extra_gpu* extra = (ggml_tensor_extra_gpu*)src0->extra;
+        if (!extra) return; // only happens in the CI/UT permute case
+
+        if (extra->optimized_feature.reorder) return; // skip tensors that have already been reordered
+
+        reorder_qw(src0, ctx->stream());
+        extra->optimized_feature.reorder = true; // used by the decode/dequantize steps that follow
+    }
+}
+
 static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     const bool split = ggml_backend_buffer_is_sycl_split(src0->buffer);
@@ -2914,6 +2972,7 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
         // KQ + KQV multi-batch
         ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst);
     } else if (use_dequantize_mul_mat_vec) {
+        opt_for_reorder(&ctx, src0, src1, dst); // the OP function in this branch supports reorder.
         ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, false);
         // save_tensor_txt("1/dst_1.txt", (float*) dst->data, src0->ne[1], sizeof(float), ctx.stream());
     } else if (use_mul_mat_vec_q) {
@@ -2921,6 +2980,7 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
     } else if (use_mul_mat_q) {
         ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_q, true);
     } else {
+        opt_for_reorder(&ctx, src0, src1, dst); // the OP function in this branch supports reorder.
         ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false);
     }
 }
@@ -3545,71 +3605,8 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }
 
-static void reorder_qw(char *data_device, const int ncols, const int nrows,
-                       size_t size, size_t offset, dpct::queue_ptr stream) {
-    auto tmp_buf = sycl::malloc_shared<char>(size, *stream);
-    SYCL_CHECK(
-        CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size)
-            .wait()));
-    GGML_ASSERT((size % sizeof(block_q4_0) == 0));
-    GGML_ASSERT((offset % sizeof(block_q4_0) == 0));
-    int offset_blks = offset / sizeof(block_q4_0);
-    auto qs_ptr = (uint8_t*)data_device + offset_blks * QK4_0 / 2;;
-    auto d_ptr = (sycl::half*)(qs_ptr + ncols * nrows / 2) + offset_blks;
-
-    stream->parallel_for(
-        size / sizeof(block_q4_0),
-        [=](auto i) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-            const block_q4_0* x = (const block_q4_0*)tmp_buf;
-            const int ib = i;
-
-            for (int j = 0; j < QK4_0/2; j ++)
-            {
-                *(qs_ptr + ib * QK4_0 / 2 + j) = x[ib].qs[j];
-            }
-            *(d_ptr + ib) = x[ib].d;
-        });
-
-    sycl::free(tmp_buf, *stream);
-}
-
-static void reorder_qw(ggml_tensor * src0, dpct::queue_ptr stream) {
-    char*data_device = (char*)src0->data;
-    size_t ncols = src0->ne[0];
-    size_t nrows = src0->ne[1];
-    size_t size = ggml_nbytes(src0);
-
-    reorder_qw(data_device, ncols, nrows, size, 0, stream);
-}
-
-static void opt_for_reorder(ggml_tensor * dst, dpct::queue_ptr stream) {
-    ggml_tensor *src0 = dst->src[0];
-    ggml_tensor *src1 = dst->src[1];
-
-    if (dst->op == GGML_OP_MUL_MAT && src0->type == GGML_TYPE_Q4_0 &&
-        src1->ne[2]==1 && src1->ne[3]==1) {
-        reorder_qw(src0, stream);
-        ggml_tensor_extra_gpu* extra = (ggml_tensor_extra_gpu*)src0->extra;
-        GGML_ASSERT(extra);
-        extra->optimized_feature.reorder = true; //used to decode/dequan in next steps.
-    }
-}
-
-static void optimize_graph_once(ggml_cgraph * cgraph, ggml_backend_sycl_context * ctx) {
-    dpct::queue_ptr stream = ctx->stream();
-    if (ctx->optimized_graph) {
-        return;
-    }
-    ctx->optimized_graph = true;
-
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        if (ctx->opt_feature.reorder) opt_for_reorder(cgraph->nodes[i], stream);
-    }
-}
-
 static void ggml_backend_sycl_graph_compute_impl(ggml_backend_sycl_context * sycl_ctx, ggml_cgraph * cgraph) {
     ggml_sycl_set_main_device(sycl_ctx->device);
-    if (!g_ggml_sycl_disable_optimize) optimize_graph_once(cgraph, sycl_ctx);
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];