SYCL: Remove misleading ggml_sycl_op_flatten function (#12387)

* SYCL: Remove misleading ggml_sycl_op_flatten function

* remove trailing whitespace

* Fix L2 norm from rebase

* remove try catch block from element_wise.cpp

* remove comment from common.hpp

* ggml-sycl.cpp: Add try catch sycl::exception block in compute_forward

* norm.cpp: remove try catch exception block
Akarshan Biswas authored 9 months ago
commit 6c02a032fa
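
For context, the convention this commit moves every SYCL op to: instead of being dispatched through ggml_sycl_op_flatten with pre-extracted float pointers, each op now receives only the backend context and the destination tensor, and fetches its sources, device, stream, and data pointers itself. Below is a minimal sketch of the new pattern, modelled on the element-wise ops in this diff; unary_op_example and some_kernel_f32_sycl are placeholder names for illustration, not symbols added by this commit:

    // Sketch only, assuming the backend's common.hpp declarations are in scope.
    static inline void unary_op_example(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
        GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
        GGML_ASSERT(dst->type == GGML_TYPE_F32);

        dpct::queue_ptr main_stream = ctx.stream();       // stream now comes from the context
        SYCL_CHECK(ggml_sycl_set_device(ctx.device));      // device selection moved into the op

        // dd = data device; pointers are read straight from the tensors
        const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
        float *       dst_dd  = static_cast<float *>(dst->data);

        // placeholder for launchers such as silu_f32_sycl or relu_f32_sycl
        some_kernel_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
    }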

+ 0 - 35
ggml/src/ggml-sycl/common.cpp

@@ -66,41 +66,6 @@ int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block
   return sycl_down_blk_size;
 }
 
-void ggml_sycl_op_flatten(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
-                                 const ggml_tensor *src1, ggml_tensor *dst,
-                                 const ggml_sycl_op_flatten_t op) try {
-
-    const bool use_src1 = src1 != nullptr;
-    if(use_src1)
-      GGML_ASSERT(strcmp(src1->buffer->buft->iface.get_name(src1->buffer->buft), GGML_SYCL_NAME "_Split") != 0);
-    GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0);
-
-    // dd = data device
-    float * src0_ddf = (float *) src0->data;
-    float * src1_ddf = use_src1 ? (float *) src1->data : nullptr;
-    float *  dst_ddf = (float *) dst->data;
-
-    ggml_sycl_pool_alloc<float> src0_f(ctx.pool());
-    ggml_sycl_pool_alloc<float> src1_f(ctx.pool());
-    ggml_sycl_pool_alloc<float>  dst_f(ctx.pool());
-
-    ggml_sycl_set_device(ctx.device);
-    queue_ptr main_stream = ctx.stream();
-    // GGML_SYCL_DEBUG("ctx.device=%d, main_stream=%p src0_on_device=%d, src1_on_device=%d, dst_on_device=%d\n",
-        // ctx.device, main_stream, src0_on_device, src1_on_device, dst_on_device);
-
-    // do the computation
-    op(ctx, src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
-    // print_ggml_tensor("tensor", dst);
-}
-catch (sycl::exception const &exc) {
-
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-
 void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector<queue_ptr> streams) {
     for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
         for (int64_t is = 0; is < GGML_SYCL_MAX_STREAMS; ++is) {

+ 8 - 20
ggml/src/ggml-sycl/common.hpp

@@ -494,12 +494,6 @@ static __dpct_inline__ Tp* get_pointer(sycl::local_accessor<Tp, dim> acc) {
 
 int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block_size);
 
-typedef void (*ggml_sycl_op_flatten_t)(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
-                                       const ggml_tensor *src1,
-                                       ggml_tensor *dst, const float *src0_dd,
-                                       const float *src1_dd, float *dst_dd,
-                                       const queue_ptr &main_stream);
-
 template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
 static void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
         int ne0, int ne1, int ne2, int ne3,
@@ -757,24 +751,22 @@ struct bin_bcast_sycl {
 
 template <class op>
 inline void ggml_sycl_op_bin_bcast(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
-                                   const ggml_tensor *src1, ggml_tensor *dst,
-                                   const float *src0_dd, const float *src1_dd,
-                                   float *dst_dd,
-                                   const queue_ptr &main_stream) {
+                                   const ggml_tensor *src1, ggml_tensor *dst) {
+    dpct::queue_ptr main_stream = ctx.stream();
 
     if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-        op()(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
+        op()(ctx, src0, src1, dst, (const float *)src0->data, (const float *)src1->data, (float *)dst->data, main_stream);
     } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
-        op()(ctx, src0, src1, dst, (const sycl::half *)src0_dd, src1_dd,
-             (sycl::half *)dst_dd, main_stream);
+        op()(ctx, src0, src1, dst, (const sycl::half *)src0->data, (const float *)src1->data,
+             (sycl::half *)dst->data, main_stream);
     } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
-        op()(ctx, src0, src1, dst, (const sycl::half *)src0_dd, src1_dd, dst_dd,
+        op()(ctx, src0, src1, dst, (const sycl::half *)src0->data, (const float *)src1->data, (float *)dst->data,
             main_stream);
     } else if (src0->type == GGML_TYPE_I32 && dst->type == GGML_TYPE_I32) {
-        op()(ctx, src0, src1, dst, (const int32_t *)src0_dd, (const int32_t *)src1_dd, (int32_t *)dst_dd,
+        op()(ctx, src0, src1, dst, (const int32_t *)src0->data, (const int32_t *)src1->data, (int32_t *)dst->data,
             main_stream);
     } else if (src0->type == GGML_TYPE_I16 && dst->type == GGML_TYPE_I16) {
-        op()(ctx, src0, src1, dst, (const int16_t *)src0_dd, (const int16_t *)src1_dd, (int16_t *)dst_dd,
+        op()(ctx, src0, src1, dst, (const int16_t *)src0->data, (const int16_t *)src1->data, (int16_t *)dst->data,
             main_stream);
     } else {
         fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
@@ -784,8 +776,4 @@ inline void ggml_sycl_op_bin_bcast(ggml_backend_sycl_context & ctx, const ggml_t
 }
 
 bool gpu_has_xmx(sycl::device &dev);
-
-void ggml_sycl_op_flatten(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
-                                 const ggml_tensor *src1, ggml_tensor *dst,
-                                 const ggml_sycl_op_flatten_t op);
 #endif // GGML_SYCL_COMMON_HPP

+ 181 - 269
ggml/src/ggml-sycl/element_wise.cpp

@@ -509,497 +509,409 @@ static void pad_f32_sycl(const float *x, float *dst, const int ne00,
         });
 }
 
-inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
-                              ggml_tensor *dst, const float *src0_dd,
-                              const float *src1_dd, float *dst_dd,
-                              const queue_ptr &main_stream) {
+inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
 
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    dpct::queue_ptr main_stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+    float *       dst_dd  = static_cast<float *>(dst->data);
 
-    silu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
 
-    GGML_UNUSED(src1);
-    GGML_UNUSED(dst);
-    GGML_UNUSED(src1_dd);
-    GGML_UNUSED(ctx);
+    silu_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
 }
 
-inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
-                              ggml_tensor *dst, const float *src0_dd,
-                              const float *src1_dd, float *dst_dd,
-                              const queue_ptr &main_stream) {
+inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
 
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    dpct::queue_ptr main_stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+    float *       dst_dd  = static_cast<float *>(dst->data);
 
-    gelu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
-
-    GGML_UNUSED(src1);
-    GGML_UNUSED(dst);
-    GGML_UNUSED(src1_dd);
-    GGML_UNUSED(ctx);
+    gelu_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
 }
-inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
-                                    const ggml_tensor *src1, ggml_tensor *dst,
-                                    const float *src0_dd, const float *src1_dd,
-                                    float *dst_dd,
-                                    const queue_ptr &main_stream) {
 
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
 
-    gelu_quick_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    dpct::queue_ptr main_stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+    float *       dst_dd  = static_cast<float *>(dst->data);
 
-    GGML_UNUSED(src1);
-    GGML_UNUSED(dst);
-    GGML_UNUSED(src1_dd);
-    GGML_UNUSED(ctx);
+    gelu_quick_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
 }
 
-inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
-                              ggml_tensor *dst, const float *src0_dd,
-                              const float *src1_dd, float *dst_dd,
-                              const queue_ptr &main_stream) {
+inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
 
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
-    tanh_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
-
-    GGML_UNUSED(src1);
-    GGML_UNUSED(dst);
-    GGML_UNUSED(src1_dd);
-    GGML_UNUSED(ctx);
+    dpct::queue_ptr main_stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+    float *       dst_dd  = static_cast<float *>(dst->data);
+    tanh_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
 }
 
-inline void ggml_sycl_op_relu(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
-                              ggml_tensor *dst, const float *src0_dd,
-                              const float *src1_dd, float *dst_dd,
-                              const queue_ptr &main_stream) {
+inline void ggml_sycl_op_relu(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
 
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    dpct::queue_ptr main_stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+    float *       dst_dd  = static_cast<float *>(dst->data);
 
-    relu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
-
-    GGML_UNUSED(src1);
-    GGML_UNUSED(dst);
-    GGML_UNUSED(src1_dd);
-    GGML_UNUSED(ctx);
+    relu_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
 }
 
-inline void ggml_sycl_op_hardsigmoid(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
-                                     const ggml_tensor *src1, ggml_tensor *dst,
-                                     const float *src0_dd, const float *src1_dd,
-                                     float *dst_dd,
-                                     const queue_ptr &main_stream) {
+inline void ggml_sycl_op_hardsigmoid(ggml_backend_sycl_context & ctx,  ggml_tensor *dst) {
 
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    dpct::queue_ptr main_stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+    float *       dst_dd  = static_cast<float *>(dst->data);
 
-    hardsigmoid_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
-
-    GGML_UNUSED(src1);
-    GGML_UNUSED(dst);
-    GGML_UNUSED(src1_dd);
-    GGML_UNUSED(ctx);
+    hardsigmoid_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
 }
 
-inline void ggml_sycl_op_hardswish(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
-                                   const ggml_tensor *src1, ggml_tensor *dst,
-                                   const float *src0_dd, const float *src1_dd,
-                                   float *dst_dd, const queue_ptr &main_stream) {
+inline void ggml_sycl_op_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
 
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    dpct::queue_ptr main_stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+    float *       dst_dd  = static_cast<float *>(dst->data);
 
-    hardswish_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
-
-    GGML_UNUSED(src1);
-    GGML_UNUSED(dst);
-    GGML_UNUSED(src1_dd);
-    GGML_UNUSED(ctx);
+    hardswish_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
 }
 
-inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
-                                   const ggml_tensor *src1, ggml_tensor *dst,
-                                   const float *src0_dd, const float *src1_dd,
-                                   float *dst_dd, const queue_ptr &main_stream) {
+inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
 
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    dpct::queue_ptr main_stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+    float *       dst_dd  = static_cast<float *>(dst->data);
 
-    exp_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
-
-    GGML_UNUSED(src1);
-    GGML_UNUSED(dst);
-    GGML_UNUSED(src1_dd);
-    GGML_UNUSED(ctx);
+    exp_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
 }
 
-inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
-                                   const ggml_tensor *src1, ggml_tensor *dst,
-                                   const float *src0_dd, const float *src1_dd,
-                                   float *dst_dd, const queue_ptr &main_stream) {
+inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
 
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    dpct::queue_ptr main_stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+    float *       dst_dd  = static_cast<float *>(dst->data);
 
-    log_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
-
-    GGML_UNUSED(src1);
-    GGML_UNUSED(dst);
-    GGML_UNUSED(src1_dd);
-    GGML_UNUSED(ctx);
+    log_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
 }
 
-inline void ggml_sycl_op_sigmoid(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
-                                   const ggml_tensor *src1, ggml_tensor *dst,
-                                   const float *src0_dd, const float *src1_dd,
-                                   float *dst_dd, const queue_ptr &main_stream) {
+inline void ggml_sycl_op_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
 
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    dpct::queue_ptr main_stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+    float *       dst_dd  = static_cast<float *>(dst->data);
 
-    sigmoid_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
 
-    GGML_UNUSED(src1);
-    GGML_UNUSED(dst);
-    GGML_UNUSED(src1_dd);
-    GGML_UNUSED(ctx);
+    sigmoid_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
 }
 
-inline void ggml_sycl_op_sqrt(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
-                                   const ggml_tensor *src1, ggml_tensor *dst,
-                                   const float *src0_dd, const float *src1_dd,
-                                   float *dst_dd, const queue_ptr &main_stream) {
+inline void ggml_sycl_op_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
 
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    dpct::queue_ptr main_stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+    float *       dst_dd  = static_cast<float *>(dst->data);
 
-    sqrt_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
 
-    GGML_UNUSED(src1);
-    GGML_UNUSED(dst);
-    GGML_UNUSED(src1_dd);
-    GGML_UNUSED(ctx);
+    sqrt_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
 }
 
-inline void ggml_sycl_op_sin(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
-                                   const ggml_tensor *src1, ggml_tensor *dst,
-                                   const float *src0_dd, const float *src1_dd,
-                                   float *dst_dd, const queue_ptr &main_stream) {
+inline void ggml_sycl_op_sin(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
 
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    dpct::queue_ptr main_stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+    float *       dst_dd  = static_cast<float *>(dst->data);
 
-    sin_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
-
-    GGML_UNUSED(src1);
-    GGML_UNUSED(dst);
-    GGML_UNUSED(src1_dd);
-    GGML_UNUSED(ctx);
+    sin_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
 }
 
-inline void ggml_sycl_op_cos(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
-                                   const ggml_tensor *src1, ggml_tensor *dst,
-                                   const float *src0_dd, const float *src1_dd,
-                                   float *dst_dd, const queue_ptr &main_stream) {
+inline void ggml_sycl_op_cos(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
 
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    dpct::queue_ptr main_stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+    float *       dst_dd  = static_cast<float *>(dst->data);
 
-    cos_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
-
-    GGML_UNUSED(src1);
-    GGML_UNUSED(dst);
-    GGML_UNUSED(src1_dd);
-    GGML_UNUSED(ctx);
+    cos_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
 }
 
-inline void ggml_sycl_op_step(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
-                                   const ggml_tensor *src1, ggml_tensor *dst,
-                                   const float *src0_dd, const float *src1_dd,
-                                   float *dst_dd, const queue_ptr &main_stream) {
+inline void ggml_sycl_op_step(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
 
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    dpct::queue_ptr main_stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+    float *       dst_dd  = static_cast<float *>(dst->data);
 
-    step_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
-
-    GGML_UNUSED(src1);
-    GGML_UNUSED(dst);
-    GGML_UNUSED(src1_dd);
-    GGML_UNUSED(ctx);
+    step_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
 }
 
-inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
-                                   const ggml_tensor *src1, ggml_tensor *dst,
-                                   const float *src0_dd, const float *src1_dd,
-                                   float *dst_dd, const queue_ptr &main_stream) {
 
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
 
-    neg_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    dpct::queue_ptr main_stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+    float *       dst_dd  = static_cast<float *>(dst->data);
 
-    GGML_UNUSED(src1);
-    GGML_UNUSED(dst);
-    GGML_UNUSED(src1_dd);
-    GGML_UNUSED(ctx);
+    neg_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
 }
 
-inline void ggml_sycl_op_leaky_relu(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
-                                    const ggml_tensor *src1, ggml_tensor *dst,
-                                    const float *src0_dd, const float *src1_dd,
-                                    float *dst_dd,
-                                    const queue_ptr &main_stream) {
+inline void ggml_sycl_op_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
 
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    dpct::queue_ptr main_stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+    float *       dst_dd  = static_cast<float *>(dst->data);
 
     float negative_slope;
     memcpy(&negative_slope, dst->op_params, sizeof(float));
 
-    leaky_relu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), negative_slope, main_stream);
-
-    GGML_UNUSED(src1);
-    GGML_UNUSED(dst);
-    GGML_UNUSED(src1_dd);
-    GGML_UNUSED(ctx);
+    leaky_relu_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), negative_slope, main_stream);
 }
 
-inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
-                             ggml_tensor *dst, const float *src0_dd,
-                             const float *src1_dd, float *dst_dd,
-                             const queue_ptr &main_stream) {
+inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
 
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    dpct::queue_ptr main_stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+    float *       dst_dd  = static_cast<float *>(dst->data);
 
-    sqr_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
-
-    GGML_UNUSED(src1);
-    GGML_UNUSED(dst);
-    GGML_UNUSED(src1_dd);
-    GGML_UNUSED(ctx);
+    sqr_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
 }
 
-inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
-                                 const ggml_tensor *src1, ggml_tensor *dst,
-                                 const float *src0_dd, const float *src1_dd,
-                                 float *dst_dd,
-                                 const queue_ptr &main_stream) {
+inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
 
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
     GGML_ASSERT(dst->type == GGML_TYPE_F32);
+    dpct::queue_ptr main_stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+    float *       dst_dd  = static_cast<float *>(dst->data);
 
-    const float sf0 = (float)dst->ne[0]/src0->ne[0];
-    const float sf1 = (float)dst->ne[1]/src0->ne[1];
-    const float sf2 = (float)dst->ne[2]/src0->ne[2];
-    const float sf3 = (float)dst->ne[3]/src0->ne[3];
+    const float sf0 = (float)dst->ne[0]/dst->src[0]->ne[0];
+    const float sf1 = (float)dst->ne[1]/dst->src[0]->ne[1];
+    const float sf2 = (float)dst->ne[2]/dst->src[0]->ne[2];
+    const float sf3 = (float)dst->ne[3]/dst->src[0]->ne[3];
 
-    upscale_f32_sycl(src0_dd, dst_dd, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
+    upscale_f32_sycl(src0_dd, dst_dd, dst->src[0]->nb[0], dst->src[0]->nb[1], dst->src[0]->nb[2], dst->src[0]->nb[3],
                      dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3,
                      main_stream);
-
-    GGML_UNUSED(src1);
-    GGML_UNUSED(dst);
-    GGML_UNUSED(src1_dd);
-    GGML_UNUSED(ctx);
 }
 
-inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
-                             ggml_tensor *dst, const float *src0_dd,
-                             const float *src1_dd, float *dst_dd,
-                             const queue_ptr &main_stream) {
+inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
 
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
     GGML_ASSERT(dst->type == GGML_TYPE_F32);
-    GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
+    GGML_ASSERT(dst->src[0]->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
+    dpct::queue_ptr main_stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+    float *       dst_dd  = static_cast<float *>(dst->data);
 
     pad_f32_sycl(src0_dd, dst_dd,
-        src0->ne[0], src0->ne[1], src0->ne[2],
+        dst->src[0]->ne[0], dst->src[0]->ne[1], dst->src[0]->ne[2],
         dst->ne[0], dst->ne[1], dst->ne[2], main_stream);
-
-    GGML_UNUSED(src1);
-    GGML_UNUSED(dst);
-    GGML_UNUSED(src1_dd);
-    GGML_UNUSED(ctx);
 }
 
-inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
-                             ggml_tensor *dst, const float *src0_dd,
-                             const float *src1_dd, float *dst_dd,
-                             const queue_ptr &main_stream) {
+inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
 
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->src[1]->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
     GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported
+    dpct::queue_ptr main_stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+    const float * src1_dd = static_cast<const float*>(dst->src[1]->data);
+    float *       dst_dd  = static_cast<float *>(dst->data);
 
     int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
     int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
     // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
     int offset = dst->op_params[3] / 4; // offset in bytes
 
-    acc_f32_sycl(src0_dd, src1_dd, dst_dd, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, main_stream);
-
-    GGML_UNUSED(dst);
-    GGML_UNUSED(ctx);
+    acc_f32_sycl(src0_dd, src1_dd, dst_dd, ggml_nelements(dst), dst->src[1]->ne[0], dst->src[1]->ne[1], dst->src[1]->ne[2], nb1, nb2, offset, main_stream);
 }
 
-inline void ggml_sycl_op_add(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
-                             ggml_tensor *dst, const float *src0_dd,
-                             const float *src1_dd, float *dst_dd,
-                             const queue_ptr &main_stream) {
+inline void ggml_sycl_op_add(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
 
-    ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_add>>(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
+    ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_add>>(ctx, dst->src[0], dst->src[1], dst);
 }
 
-inline void ggml_sycl_op_sub(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
-                             ggml_tensor *dst, const float *src0_dd,
-                             const float *src1_dd, float *dst_dd,
-                             const queue_ptr &main_stream) {
+inline void ggml_sycl_op_sub(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
 
-    ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_sub>>(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
+    ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_sub>>(ctx, dst->src[0], dst->src[1], dst);
 }
 
-inline void ggml_sycl_op_mul(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
-                             ggml_tensor *dst, const float *src0_dd,
-                             const float *src1_dd, float *dst_dd,
-                             const queue_ptr &main_stream) {
+inline void ggml_sycl_op_mul(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
 
-    ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_mul>>(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
+    ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_mul>>(ctx, dst->src[0], dst->src[1], dst);
 }
 
-inline void ggml_sycl_op_div(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
-                             ggml_tensor *dst, const float *src0_dd,
-                             const float *src1_dd, float *dst_dd,
-                             const queue_ptr &main_stream) {
+inline void ggml_sycl_op_div(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
 
-    ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_div>>(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
+    ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_div>>(ctx, dst->src[0], dst->src[1], dst);
 }
 
 
 void ggml_sycl_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_sqrt);
+    ggml_sycl_op_sqrt(ctx, dst);
     GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_sin);
+    ggml_sycl_op_sin(ctx, dst);
     GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_cos(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_cos);
+    ggml_sycl_op_cos(ctx, dst);
     GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_acc(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_acc);
+    ggml_sycl_op_acc(ctx, dst);
     GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_gelu);
+    ggml_sycl_op_gelu(ctx, dst);
     GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_silu);
+    ggml_sycl_op_silu(ctx, dst);
     GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_gelu_quick);
+    ggml_sycl_op_gelu_quick(ctx, dst);
     GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_tanh);
+    ggml_sycl_op_tanh(ctx, dst);
     GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_relu);
+    ggml_sycl_op_relu(ctx, dst);
     GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_sigmoid);
+    ggml_sycl_op_sigmoid(ctx, dst);
     GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_hardsigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_hardsigmoid);
+    ggml_sycl_op_hardsigmoid(ctx, dst);
     GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_hardswish);
+    ggml_sycl_op_hardswish(ctx, dst);
     GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 
 void ggml_sycl_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_exp);
+    ggml_sycl_op_exp(ctx, dst);
     GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_log);
+    ggml_sycl_op_log(ctx, dst);
     GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_neg);
+    ggml_sycl_op_neg(ctx, dst);
     GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_step(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_step);
+    ggml_sycl_op_step(ctx, dst);
     GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_leaky_relu);
+    ggml_sycl_op_leaky_relu(ctx, dst);
     GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_sqr(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_sqr);
+    ggml_sycl_op_sqr(ctx, dst);
     GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_upscale);
+    ggml_sycl_op_upscale(ctx, dst);
     GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_pad);
+    ggml_sycl_op_pad(ctx, dst);
     GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
@@ -1007,24 +919,24 @@ void ggml_sycl_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
 
 void ggml_sycl_add(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_add);
+    ggml_sycl_op_add(ctx, dst);
     GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_sub(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_sub);
+    ggml_sycl_op_sub(ctx, dst);
     GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_mul(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_mul);
+    ggml_sycl_op_mul(ctx, dst);
     GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_div(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_div);
+    ggml_sycl_op_div(ctx, dst);
     GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
+ 24 - 20
ggml/src/ggml-sycl/getrows.cpp

@@ -257,50 +257,54 @@ static void get_rows_sycl_float(ggml_backend_sycl_context & ctx, const ggml_tens
     GGML_UNUSED(ctx);
 }
 
-void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
-                                  const ggml_tensor *src1, ggml_tensor *dst,
-                                  const float *src0_d, const float *src1_d,
-                                  float *dst_d, const queue_ptr &stream) {
+void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
 
-    GGML_ASSERT(src1->type == GGML_TYPE_I32);
+    GGML_ASSERT(dst->src[1]->type == GGML_TYPE_I32);
     GGML_ASSERT(dst->type == GGML_TYPE_F32);
 
-    GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
-    GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type));
+    GGML_ASSERT(dst->src[0]->nb[0] == ggml_type_size(dst->src[0]->type));
+    GGML_ASSERT(dst->src[1]->nb[0] == ggml_type_size(dst->src[1]->type));
     GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type));
 
-    const int32_t * src1_i32 = (const int32_t *) src1_d;
-
-    switch (src0->type) {
+    const int32_t * src1_i32 = (const int32_t *) dst->src[1]->data;
+    /* TODO: Refactor and remove duplicates */
+    switch (dst->src[0]->type) {
         case GGML_TYPE_F16:
-            get_rows_sycl_float(ctx, src0, src1, dst, (const sycl::half *)src0_d,
-                                src1_i32, dst_d, stream);
+            get_rows_sycl_float(ctx, dst->src[0], dst->src[1], dst, (const sycl::half *)dst->src[0]->data,
+                                src1_i32, (float *)dst->data, ctx.stream());
             break;
         case GGML_TYPE_F32:
-            get_rows_sycl_float(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+            get_rows_sycl_float(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data,
+            src1_i32, (float *)dst->data, ctx.stream());
            break;
         case GGML_TYPE_Q4_0:
             if (ctx.opt_feature.reorder && dst->op == GGML_OP_MUL_MAT) {
-                get_rows_sycl_reorder<QK4_0, QR4_0, dequantize_q4_0_reorder>(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+                get_rows_sycl_reorder<QK4_0, QR4_0, dequantize_q4_0_reorder>(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data,
+                src1_i32, (float *)dst->data, ctx.stream());
             } else {
-                get_rows_sycl<QK4_0, QR4_0, dequantize_q4_0>(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+                get_rows_sycl<QK4_0, QR4_0, dequantize_q4_0>(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data,
+                src1_i32, (float *)dst->data, ctx.stream());
             }
             break;
         case GGML_TYPE_Q4_1:
-            get_rows_sycl<QK4_1, QR4_1, dequantize_q4_1>(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+            get_rows_sycl<QK4_1, QR4_1, dequantize_q4_1>(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data,
+            src1_i32, (float *)dst->data, ctx.stream());
             break;
         case GGML_TYPE_Q5_0:
-            get_rows_sycl<QK5_0, QR5_0, dequantize_q5_0>(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+            get_rows_sycl<QK5_0, QR5_0, dequantize_q5_0>(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data,
+            src1_i32, (float *)dst->data, ctx.stream());
             break;
         case GGML_TYPE_Q5_1:
-            get_rows_sycl<QK5_1, QR5_1, dequantize_q5_1>(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+            get_rows_sycl<QK5_1, QR5_1, dequantize_q5_1>(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data,
+            src1_i32, (float *)dst->data, ctx.stream());
             break;
         case GGML_TYPE_Q8_0:
-            get_rows_sycl<QK8_0, QR8_0, dequantize_q8_0>(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+            get_rows_sycl<QK8_0, QR8_0, dequantize_q8_0>(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data,
+            src1_i32, (float *)dst->data, ctx.stream());
             break;
         default:
             // TODO: k-quants
-            GGML_LOG_ERROR("%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
+            GGML_LOG_ERROR("%s: unsupported type: %s\n", __func__, ggml_type_name(dst->src[0]->type));
             GGML_ABORT("fatal error");
     }
 }

+ 1 - 4
ggml/src/ggml-sycl/getrows.hpp

@@ -15,9 +15,6 @@
 
 #include "common.hpp"
 
-void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
-    const ggml_tensor *src1, ggml_tensor *dst,
-    const float *src0_d, const float *src1_d,
-    float *dst_d, const queue_ptr &stream);
+void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor *dst);
 
 #endif // GGML_SYCL_GETROWS_HPP

+ 85 - 127
ggml/src/ggml-sycl/ggml-sycl.cpp

@@ -1988,16 +1988,8 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }
 
-static void ggml_sycl_op_repeat(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
-                                const ggml_tensor *src1, ggml_tensor *dst,
-                                const float *src0_d, const float *src1_d,
-                                float *dst_d,
-                                const queue_ptr &main_stream) {
-
-    ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_repeat>>(ctx, dst, src0, dst, nullptr, src0_d, dst_d, main_stream);
-
-    GGML_UNUSED(src1);
-    GGML_UNUSED(src1_d);
+static void ggml_sycl_op_repeat(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
+    ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_repeat>>(ctx, dst, dst->src[0], dst);
 }
 
 
@@ -2132,13 +2124,14 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }
 
-static void ggml_sycl_op_pool2d(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
-                                const ggml_tensor *src1, ggml_tensor *dst,
-                                const float *src0_dd, const float *src1_dd,
-                                float *dst_dd, const queue_ptr &main_stream) {
+static void ggml_sycl_op_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
 
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    dpct::queue_ptr main_stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+    float *       dst_dd  = static_cast<float *>(dst->data);
 
     const int32_t * opts = (const int32_t *)dst->op_params;
     enum ggml_op_pool op = static_cast<ggml_op_pool>(opts[0]);
@@ -2149,8 +2142,8 @@ static void ggml_sycl_op_pool2d(ggml_backend_sycl_context & ctx, const ggml_tens
     const int p0 = opts[5];
     const int p1 = opts[6];
 
-    const int64_t IH = src0->ne[1];
-    const int64_t IW = src0->ne[0];
+    const int64_t IH = dst->src[0]->ne[1];
+    const int64_t IW = dst->src[0]->ne[0];
 
     const int64_t N = dst->ne[3];
     const int64_t OC = dst->ne[2];
@@ -2169,163 +2162,125 @@ static void ggml_sycl_op_pool2d(ggml_backend_sycl_context & ctx, const ggml_tens
                                parallel_elements, src0_dd, dst_dd, op,
                                item_ct1);
         });
-
-    GGML_UNUSED(src1);
-    GGML_UNUSED(src1_dd);
-    GGML_UNUSED(ctx);
 }
 
-inline void ggml_sycl_op_sum(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
-                                  const ggml_tensor *src1, ggml_tensor *dst,
-                                  const float *src0_dd, const float *src1_dd,
-                                  float *dst_dd,
-                                  const queue_ptr &main_stream) {
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+inline void ggml_sycl_op_sum(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    dpct::queue_ptr main_stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+    float *       dst_dd  = static_cast<float *>(dst->data);
 
-    const int64_t ne = ggml_nelements(src0);
+    const int64_t ne = ggml_nelements(dst->src[0]);
 
     sum_rows_f32_sycl(src0_dd, dst_dd, ne, 1, main_stream);
-
-    GGML_UNUSED(src1);
-    GGML_UNUSED(dst);
-    GGML_UNUSED(src1_dd);
-    GGML_UNUSED(ctx);
 }
 
-inline void ggml_sycl_op_sum_rows(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
-                                  const ggml_tensor *src1, ggml_tensor *dst,
-                                  const float *src0_dd, const float *src1_dd,
-                                  float *dst_dd,
-                                  const queue_ptr &main_stream) {
+inline void ggml_sycl_op_sum_rows(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
 
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    dpct::queue_ptr main_stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+    float *       dst_dd  = static_cast<float *>(dst->data);
 
-    const int64_t ncols = src0->ne[0];
-    const int64_t nrows = ggml_nrows(src0);
+    const int64_t ncols = dst->src[0]->ne[0];
+    const int64_t nrows = ggml_nrows(dst->src[0]);
 
     sum_rows_f32_sycl(src0_dd, dst_dd, ncols, nrows, main_stream);
-
-    GGML_UNUSED(src1);
-    GGML_UNUSED(dst);
-    GGML_UNUSED(src1_dd);
-    GGML_UNUSED(ctx);
 }
 
-inline void ggml_sycl_op_argsort(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
-                                 const ggml_tensor *src1, ggml_tensor *dst,
-                                 const float *src0_dd, const float *src1_dd,
-                                 float *dst_dd,
-                                 const queue_ptr &main_stream) {
+inline void ggml_sycl_op_argsort(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_I32);
+    dpct::queue_ptr main_stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+    int32_t *       dst_dd  = static_cast<int32_t *>(dst->data);

-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_I32);

-    const int64_t ncols = src0->ne[0];
-    const int64_t nrows = ggml_nrows(src0);
+    const int64_t ncols = dst->src[0]->ne[0];
+    const int64_t nrows = ggml_nrows(dst->src[0]);

     enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];

-    argsort_f32_i32_sycl(src0_dd, (int *)dst_dd, ncols, nrows, order, main_stream);
-
-    GGML_UNUSED(src1);
-    GGML_UNUSED(dst);
-    GGML_UNUSED(src1_dd);
-    GGML_UNUSED(ctx);
+    argsort_f32_i32_sycl(src0_dd, (int *) dst_dd, ncols, nrows, order, main_stream);
 }
 
-inline void ggml_sycl_op_argmax(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
-                                 const ggml_tensor *src1, ggml_tensor *dst,
-                                 const float *src0_dd, const float *src1_dd,
-                                 float *dst_dd,
-                                 const queue_ptr &main_stream) {
+inline void ggml_sycl_op_argmax(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {

-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_I32);

-    const int64_t ncols = src0->ne[0];
-    const int64_t nrows = ggml_nrows(src0);
+    dpct::queue_ptr main_stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+    int32_t *       dst_dd  = static_cast<int32_t *>(dst->data);

-    argmax_f32_i32_sycl(src0_dd, (int *)dst_dd, ncols, nrows, main_stream);
+    const int64_t ncols = dst->src[0]->ne[0];
+    const int64_t nrows = ggml_nrows(dst->src[0]);

-    GGML_UNUSED(src1);
-    GGML_UNUSED(dst);
-    GGML_UNUSED(src1_dd);
-    GGML_UNUSED(ctx);
+    argmax_f32_i32_sycl(src0_dd, dst_dd, ncols, nrows, main_stream);
 }
 
-inline void ggml_sycl_op_diag_mask_inf(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
-                                       const ggml_tensor *src1,
-                                       ggml_tensor *dst, const float *src0_dd,
-                                       const float *src1_dd, float *dst_dd,
-                                       const queue_ptr &main_stream) {
+inline void ggml_sycl_op_diag_mask_inf(ggml_backend_sycl_context & ctx,ggml_tensor *dst) {

-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    dpct::queue_ptr main_stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+    float *       dst_dd  = static_cast<float *>(dst->data);

-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int nrows0 = ggml_nrows(src0);
+    const int64_t ne00 = dst->src[0]->ne[0];
+    const int64_t ne01 = dst->src[0]->ne[1];
+    const int nrows0 = ggml_nrows(dst->src[0]);

     const int n_past = ((int32_t *) dst->op_params)[0];

     diag_mask_inf_f32_sycl(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream);
-
-    GGML_UNUSED(src1);
-    GGML_UNUSED(dst);
-    GGML_UNUSED(src1_dd);
-    GGML_UNUSED(ctx);
 }
 
-inline void ggml_sycl_op_scale(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
-                               ggml_tensor *dst, const float *src0_dd,
-                               const float *src1_dd, float *dst_dd,
-                               const queue_ptr &main_stream) {
+inline void ggml_sycl_op_scale(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {

-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    dpct::queue_ptr main_stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+    float *       dst_dd  = static_cast<float *>(dst->data);

     float scale;
     memcpy(&scale, dst->op_params, sizeof(float));

-    scale_f32_sycl(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
+    scale_f32_sycl(src0_dd, dst_dd, scale, ggml_nelements(dst->src[0]), main_stream);
     /*
     DPCT1010:87: SYCL uses exceptions to report errors and does not use the
     error codes. The call was replaced with 0. You need to rewrite this code.
     */
     SYCL_CHECK(0);
-
-    GGML_UNUSED(src1);
-    GGML_UNUSED(dst);
-    GGML_UNUSED(src1_dd);
-    GGML_UNUSED(ctx);
 }
 
-inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
-                               ggml_tensor *dst, const float *src0_dd,
-                               const float *src1_dd, float *dst_dd,
-                               const queue_ptr &main_stream) {
+inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {

-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+    float *       dst_dd  = static_cast<float *>(dst->data);

     float min;
     float max;
     memcpy(&min, dst->op_params, sizeof(float));
     memcpy(&max, (float *) dst->op_params + 1, sizeof(float));

-    clamp_f32_sycl(src0_dd, dst_dd, min, max, ggml_nelements(src0), main_stream);
+    clamp_f32_sycl(src0_dd, dst_dd, min, max, ggml_nelements(dst->src[0]), ctx.stream());
     /*
     DPCT1010:88: SYCL uses exceptions to report errors and does not use the
     error codes. The call was replaced with 0. You need to rewrite this code.
     */
     SYCL_CHECK(0);
-
-    GGML_UNUSED(src1);
-    GGML_UNUSED(dst);
-    GGML_UNUSED(src1_dd);
-    GGML_UNUSED(ctx);
 }

 static void ggml_sycl_set_peer_access(const int n_tokens, int main_device) {
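For reference, the ops above now share a single calling convention: the only arguments are the backend context and dst, with sources read from dst->src[] and the queue and device taken from the context. A minimal sketch of that convention follows; ggml_sycl_op_example and negate_f32_sycl are hypothetical names used only for illustration and are not part of this patch.

    // Hypothetical single-source op following the (ctx, dst) convention shown above.
    // negate_f32_sycl stands in for any f32 element-wise kernel launcher.
    inline void ggml_sycl_op_example(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
        GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
        GGML_ASSERT(dst->type == GGML_TYPE_F32);

        SYCL_CHECK(ggml_sycl_set_device(ctx.device));   // bind the context's device
        dpct::queue_ptr main_stream = ctx.stream();     // queue comes from the context, not a parameter

        const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
        float *       dst_dd  = static_cast<float *>(dst->data);

        negate_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream);
    }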
@@ -2695,37 +2650,37 @@ catch (sycl::exception const &exc) {

 static void ggml_sycl_repeat(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_repeat);
+    ggml_sycl_op_repeat(ctx, dst);
     GGML_SYCL_DEBUG("call %s done\n", __func__);
 }

 static void ggml_sycl_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_get_rows);
+    ggml_sycl_op_get_rows(ctx, dst);
     GGML_SYCL_DEBUG("call %s done\n", __func__);
 }

 static void ggml_sycl_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_norm);
+    ggml_sycl_op_norm(ctx, dst);
     GGML_SYCL_DEBUG("call %s done\n", __func__);
 }

 static void ggml_sycl_rms_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_rms_norm);
+    ggml_sycl_op_rms_norm(ctx, dst);
     GGML_SYCL_DEBUG("call %s done\n", __func__);
 }

 static void ggml_sycl_l2_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_l2_norm);
+    ggml_sycl_op_l2_norm(ctx, dst);
     GGML_SYCL_DEBUG("call %s done\n", __func__);
 }

 static void ggml_sycl_group_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_group_norm);
+    ggml_sycl_op_group_norm(ctx, dst);
     GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
@@ -3269,48 +3224,48 @@ catch (sycl::exception const &exc) {
 }

 static void ggml_sycl_scale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_scale);
+    ggml_sycl_op_scale(ctx, dst);
 }

 static void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_clamp);
+    ggml_sycl_op_clamp(ctx, dst);
 }

 static void ggml_sycl_diag_mask_inf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_diag_mask_inf);
+    ggml_sycl_op_diag_mask_inf(ctx, dst);
 }

 static void ggml_sycl_rope(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     GGML_ASSERT(ggml_is_contiguous(dst->src[0])); // TODO: this restriction is temporary until non-cont support is implemented
-    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_rope);
+    ggml_sycl_op_rope(ctx, dst);
 }

 static void ggml_sycl_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_pool2d);
+    ggml_sycl_op_pool2d(ctx, dst);
 }

 static void ggml_sycl_im2col(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_im2col);
+    ggml_sycl_op_im2col(ctx, dst);
 }

 static void ggml_sycl_sum(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     GGML_ASSERT(ggml_is_contiguous(dst->src[0]));
-    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_sum);
+    ggml_sycl_op_sum(ctx, dst);
 }

 static void ggml_sycl_sum_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     GGML_ASSERT(ggml_is_contiguous(dst->src[0]));
-    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_sum_rows);
+    ggml_sycl_op_sum_rows(ctx, dst);
 }

 static void ggml_sycl_argsort(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     GGML_ASSERT(ggml_is_contiguous(dst->src[0]));
-    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_argsort);
+    ggml_sycl_op_argsort(ctx, dst);
 }

 static void ggml_sycl_argmax(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     GGML_ASSERT(ggml_is_contiguous(dst->src[0]));
-    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_argmax);
+    ggml_sycl_op_argmax(ctx, dst);
 }
 
 
@@ -3335,7 +3290,7 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }

-static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tensor * dst) {
+static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tensor * dst) try {
     if (!g_sycl_loaded) return false;

     if (dst->src[0] != nullptr && ggml_backend_buffer_is_sycl_split(dst->src[0]->buffer)) {
@@ -3528,6 +3483,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
     }

     return true;
+} catch (sycl::exception & e) {
+    std::cerr << e.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl;
+    std::exit(1);
 }

 GGML_API void ggml_backend_sycl_get_device_description(int device, char *description,
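The compute-forward change above moves exception handling to the dispatch boundary: the whole function becomes a function-try-block, so a sycl::exception thrown by any op is caught once instead of inside each op. A minimal sketch of that shape, assuming the usual <iostream>/<cstdlib> and SYCL headers already pulled in via common.hpp; dispatch_example is a hypothetical name, not part of the patch.

    // Sketch of a function-try-block around a SYCL op dispatch.
    static bool dispatch_example(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try {
        // dispatch on dst->op and call one of the ops above, e.g.:
        ggml_sycl_op_scale(ctx, dst);
        return true;
    } catch (sycl::exception & e) {
        // one catch at the dispatch boundary replaces the per-op try/catch blocks
        std::cerr << e.what() << " at " << __FILE__ << ":" << __LINE__ << std::endl;
        std::exit(1);
    }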

+ 5 - 10
ggml/src/ggml-sycl/im2col.cpp

@@ -82,10 +82,9 @@ static void im2col_sycl(
     }
 }

-void ggml_sycl_op_im2col(
-        ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
-        ggml_tensor *dst, const float *src0_dd, const float *src1_dd, float *dst_dd,
-        const queue_ptr &main_stream) {
+void ggml_sycl_op_im2col(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];

     GGML_ASSERT(src0->type == GGML_TYPE_F16);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
@@ -115,12 +114,8 @@ void ggml_sycl_op_im2col(
     const size_t batch_offset = src1->nb[3] / 4; // nb is byte offset, src is type float32

     if (dst->type == GGML_TYPE_F16) {
-        im2col_sycl(src1_dd, (sycl::half *)dst_dd, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
+        im2col_sycl((const float *) src1->data, (sycl::half *)dst->data, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, ctx.stream());
     } else {
-        im2col_sycl(src1_dd, (float *)dst_dd, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
+        im2col_sycl((const float *) src1->data, (float *)dst->data, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, ctx.stream());
     }
-
-    GGML_UNUSED(src0);
-    GGML_UNUSED(src0_dd);
-    GGML_UNUSED(ctx);
 }
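im2col shows the same convention for ops with more than one input: both tensors are taken from dst->src[] at the top of the function and the queue from ctx.stream(). A hedged sketch of a generic two-source op in that style; ggml_sycl_op_example_binary and example_kernel_sycl are illustrative placeholders, not functions from this patch.

    // Hypothetical two-source op; example_kernel_sycl stands in for a real launcher.
    void ggml_sycl_op_example_binary(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
        const ggml_tensor * src0 = dst->src[0];
        const ggml_tensor * src1 = dst->src[1];

        GGML_ASSERT(src0->type == GGML_TYPE_F32);
        GGML_ASSERT(src1->type == GGML_TYPE_F32);
        GGML_ASSERT(dst->type  == GGML_TYPE_F32);

        // both inputs and the output are raw device pointers; the stream is owned by the context
        example_kernel_sycl((const float *) src0->data, (const float *) src1->data,
                            (float *) dst->data, ggml_nelements(dst), ctx.stream());
    }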

+ 1 - 3
ggml/src/ggml-sycl/im2col.hpp

@@ -16,8 +16,6 @@
 #include "common.hpp"

 void ggml_sycl_op_im2col(
-        ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
-        ggml_tensor *dst, const float *src0_dd, const float *src1_dd, float *dst_dd,
-        const queue_ptr &main_stream);
+        ggml_backend_sycl_context & ctx, ggml_tensor *dst);

 #endif // GGML_SYCL_IM2COL_HPP

+ 35 - 47
ggml/src/ggml-sycl/norm.cpp

@@ -397,90 +397,78 @@ static void l2_norm_f32_sycl(const float* x, float* dst, const int ncols,
     }
 }

-void ggml_sycl_op_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0, const ggml_tensor* src1,
-    ggml_tensor* dst, const float* src0_dd,
-    const float* src1_dd, float* dst_dd,
-    const queue_ptr& main_stream) {
+void ggml_sycl_op_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {

-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
     GGML_ASSERT(dst->type == GGML_TYPE_F32);

-    const int64_t ne00 = src0->ne[0];
-    const int64_t nrows = ggml_nrows(src0);
+    const int64_t ne00 = dst->src[0]->ne[0];
+    const int64_t nrows = ggml_nrows(dst->src[0]);
+    dpct::queue_ptr main_stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+    float *       dst_dd  = static_cast<float *>(dst->data);

     float eps;
     memcpy(&eps, dst->op_params, sizeof(float));

     norm_f32_sycl(src0_dd, dst_dd, ne00, nrows, eps, main_stream, ctx.device);
-
-    (void)src1;
-    (void)dst;
-    (void)src1_dd;
 }
 
-void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0,
-    const ggml_tensor* src1, ggml_tensor* dst,
-    const float* src0_dd, const float* src1_dd,
-    float* dst_dd,
-    const queue_ptr& main_stream) {
+void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {

-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
     GGML_ASSERT(dst->type == GGML_TYPE_F32);

     int num_groups = dst->op_params[0];
+    dpct::queue_ptr main_stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+
+    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+    float *       dst_dd  = static_cast<float *>(dst->data);

     float eps;
     memcpy(&eps, dst->op_params + 1, sizeof(float));

-    int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
-    group_norm_f32_sycl(src0_dd, dst_dd, num_groups, eps, group_size, src0->ne[0] * src0->ne[1] * src0->ne[2], main_stream, ctx.device);
-
-    (void)src1;
-    (void)dst;
-    (void)src1_dd;
-    GGML_UNUSED(ctx);
+    int group_size = dst->src[0]->ne[0] * dst->src[0]->ne[1] * ((dst->src[0]->ne[2] + num_groups - 1) / num_groups);
+    group_norm_f32_sycl(src0_dd, dst_dd, num_groups, eps, group_size, dst->src[0]->ne[0] * dst->src[0]->ne[1] * dst->src[0]->ne[2], main_stream, ctx.device);
 }
 
-void ggml_sycl_op_rms_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0,
-    const ggml_tensor* src1, ggml_tensor* dst,
-    const float* src0_dd, const float* src1_dd,
-    float* dst_dd,
-    const queue_ptr& main_stream) {
+void ggml_sycl_op_rms_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {

-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
     GGML_ASSERT(dst->type == GGML_TYPE_F32);

-    const int64_t ne00 = src0->ne[0];
-    const int64_t nrows = ggml_nrows(src0);
+    const int64_t ne00 = dst->src[0]->ne[0];
+    const int64_t nrows = ggml_nrows(dst->src[0]);
+    dpct::queue_ptr main_stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+
+    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+    float *       dst_dd  = static_cast<float *>(dst->data);

     float eps;
     memcpy(&eps, dst->op_params, sizeof(float));

     rms_norm_f32_sycl(src0_dd, dst_dd, ne00, nrows, eps, main_stream, ctx.device);
-
-    (void)src1;
-    (void)dst;
-    (void)src1_dd;
 }
 
-void ggml_sycl_op_l2_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0,
-    const ggml_tensor* src1, ggml_tensor* dst,
-    const float* src0_dd, const float* src1_dd,
-    float* dst_dd,
-    const queue_ptr& main_stream) {
+void ggml_sycl_op_l2_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {

-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
     GGML_ASSERT(dst->type == GGML_TYPE_F32);

-    const int64_t ne00 = src0->ne[0];
-    const int64_t nrows = ggml_nrows(src0);
+    dpct::queue_ptr main_stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+
+    const int64_t ne00 = dst->src[0]->ne[0];
+    const int64_t nrows = ggml_nrows(dst->src[0]);
+    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+    float * dst_dd = static_cast<float *>(dst->data);

     float eps;
     memcpy(&eps, dst->op_params, sizeof(float));

     l2_norm_f32_sycl(src0_dd, dst_dd, ne00, nrows, eps, main_stream, ctx.device);

-    (void)src1;
-    (void)dst;
-    (void)src1_dd;
 }
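All four norms read their epsilon out of dst->op_params with memcpy: op_params is an int32_t array, so a float hyper-parameter is stored bit-for-bit in one 32-bit slot (slot 0 for norm/rms_norm/l2_norm, slot 1 for group_norm). A small sketch of that pattern, assuming <cstring> is available; read_f32_param_example is an illustrative helper, not part of the patch.

    // Reads the float stored in slot `idx` of op_params, as the norm ops above do.
    static float read_f32_param_example(const ggml_tensor * dst, int idx) {
        float value;
        memcpy(&value, (const int32_t *) dst->op_params + idx, sizeof(float));
        return value;
    }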

+ 7 - 22
ggml/src/ggml-sycl/norm.hpp

@@ -15,27 +15,12 @@

 #include "common.hpp"

-void ggml_sycl_op_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0, const ggml_tensor* src1,
-    ggml_tensor* dst, const float* src0_dd,
-    const float* src1_dd, float* dst_dd,
-    const queue_ptr& main_stream);
-
-void ggml_sycl_op_rms_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0,
-    const ggml_tensor* src1, ggml_tensor* dst,
-    const float* src0_dd, const float* src1_dd,
-    float* dst_dd,
-    const queue_ptr& main_stream);
-
-void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0,
-    const ggml_tensor* src1, ggml_tensor* dst,
-    const float* src0_dd, const float* src1_dd,
-    float* dst_dd,
-    const queue_ptr& main_stream);
-
-void ggml_sycl_op_l2_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0,
-    const ggml_tensor* src1, ggml_tensor* dst,
-    const float* src0_dd, const float* src1_dd,
-    float* dst_dd,
-    const queue_ptr& main_stream);
+void ggml_sycl_op_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst);
+
+void ggml_sycl_op_rms_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst);
+
+void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst);
+
+void ggml_sycl_op_l2_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst);

 #endif // GGML_SYCL_NORM_HPP

+ 20 - 25
ggml/src/ggml-sycl/rope.cpp

@@ -192,18 +192,15 @@ static void rope_neox_sycl(
     }
 }

-void ggml_sycl_op_rope(
-    ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
-    const float *src0_dd, const float *src1_dd, float *dst_dd, const queue_ptr &main_stream) {
-    const ggml_tensor * src2 = dst->src[2];
+void ggml_sycl_op_rope(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {

-    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
     GGML_ASSERT( dst->type == GGML_TYPE_F32 ||  dst->type == GGML_TYPE_F16);
-    GGML_ASSERT(src0->type == dst->type);
+    GGML_ASSERT(dst->src[0]->type == dst->type);

-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t nr = ggml_nrows(src0);
+    const int64_t ne00 = dst->src[0]->ne[0];
+    const int64_t ne01 = dst->src[0]->ne[1];
+    const int64_t nr = ggml_nrows(dst->src[0]);

     //const int n_past      = ((int32_t *) dst->op_params)[0];
     const int n_dims      = ((int32_t *) dst->op_params)[1];
@@ -228,49 +225,47 @@ void ggml_sycl_op_rope(

     const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;

-    const int32_t * pos = (const int32_t *) src1_dd;
+    const int32_t * pos = (const int32_t *) dst->src[1]->data;

     const float * freq_factors = nullptr;
-    if (src2 != nullptr) {
-        freq_factors = (const float *) src2->data;
+    if (dst->src[2] != nullptr) {
+        freq_factors = (const float *) dst->src[2]->data;
     }

     rope_corr_dims corr_dims;
     ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims.v);

+    dpct::queue_ptr main_stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+
     // compute
     if (is_neox) {
-        if (src0->type == GGML_TYPE_F32) {
+        if (dst->src[0]->type == GGML_TYPE_F32) {
             rope_neox_sycl(
-                (const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
+                (const float *)dst->src[0]->data, (float *)dst->data, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
                 attn_factor, corr_dims, freq_factors, main_stream
             );
-        } else if (src0->type == GGML_TYPE_F16) {
+        } else if (dst->src[0]->type == GGML_TYPE_F16) {
             rope_neox_sycl(
-                (const sycl::half *)src0_dd, (sycl::half *)dst_dd, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
+                (const sycl::half *)dst->src[0]->data, (sycl::half *)dst->data, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
                 attn_factor, corr_dims, freq_factors, main_stream
             );
         } else {
             GGML_ABORT("fatal error");
         }
     } else {
-        if (src0->type == GGML_TYPE_F32) {
+        if (dst->src[0]->type == GGML_TYPE_F32) {
             rope_norm_sycl(
-                (const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
+                (const float *)dst->src[0]->data, (float *)dst->data, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
                 attn_factor, corr_dims, freq_factors, main_stream
             );
-        } else if (src0->type == GGML_TYPE_F16) {
+        } else if (dst->src[0]->type == GGML_TYPE_F16) {
             rope_norm_sycl(
-                (const sycl::half *)src0_dd, (sycl::half *)dst_dd, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
+                (const sycl::half *)dst->src[0]->data, (sycl::half *)dst->data, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
                 attn_factor, corr_dims, freq_factors, main_stream
             );
         } else {
             GGML_ABORT("fatal error");
         }
     }
-
-    GGML_UNUSED(src1);
-    GGML_UNUSED(dst);
-    GGML_UNUSED(src1_dd);
-    GGML_UNUSED(ctx);
 }
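rope also shows how an optional input fits this convention: dst->src[2] may be null, and a null pointer is simply forwarded to the kernel. A hedged sketch; ggml_sycl_op_example_opt and example_launcher_sycl are illustrative names only, not functions from this patch.

    // Hypothetical op with an optional third input, mirroring freq_factors above.
    inline void ggml_sycl_op_example_opt(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
        const float * optional_dd = nullptr;
        if (dst->src[2] != nullptr) {
            optional_dd = (const float *) dst->src[2]->data;
        }
        // a null optional_dd tells the (placeholder) launcher to skip the optional input
        example_launcher_sycl((const float *) dst->src[0]->data, optional_dd,
                              (float *) dst->data, ggml_nelements(dst), ctx.stream());
    }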

+ 1 - 3
ggml/src/ggml-sycl/rope.hpp

@@ -15,8 +15,6 @@

 #include "common.hpp"

-void ggml_sycl_op_rope(
-    ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
-    const float *src0_dd, const float *src1_dd, float *dst_dd, const queue_ptr &main_stream);
+void ggml_sycl_op_rope(ggml_backend_sycl_context & ctx, ggml_tensor *dst);

 #endif // GGML_SYCL_ROPE_HPP