@@ -911,6 +911,98 @@ static inline void ggml_sycl_op_swiglu(ggml_backend_sycl_context & ctx, ggml_ten
         });
 }
 
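+// Apply SwiGLU-OAI to a single value/gate pair: clamp x from above at `limit`
+// and g to [-limit, limit], then compute x * sigmoid(alpha * x) * (1 + g).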
+__dpct_inline__ float ggml_sycl_op_swiglu_oai_single(float x, float g, float alpha = 1.702f, float limit = 7.0f) {
+    x = sycl::fmin(x, limit);
+    g = sycl::fmax(sycl::fmin(g, limit), -limit);
+
+    float out_glu = x / (1.0f + sycl::native::exp(-x * alpha));
+    out_glu = out_glu * (1.0f + g);
+    return out_glu;
+}
+
+
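+// Elementwise kernel: one work-item per output element. o0/o1 are the row
+// strides (in elements) of the value and gate inputs; they may differ when
+// the gate comes from a separate tensor with its own layout.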
+template <typename T>
+static void swiglu_oai_kernel(const T * x, const T * g, T * dst, const int64_t k,
+                              const int64_t n, const int64_t o0, const int64_t o1,
+                              float alpha, float limit, sycl::nd_item<3> item_ct1) {
+    const int64_t i = int64_t(item_ct1.get_local_range(2)) * item_ct1.get_group(2) + item_ct1.get_local_id(2);
+
+    if (i >= k) {
+        return;
+    }
+
+    const int64_t j0 = (i / n) * o0 + (i % n);
+    const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n);
+
+    float xi = x[j0];
+    float gi = g[j1];
+
+    dst[i] = ggml_sycl_op_swiglu_oai_single(xi, gi, alpha, limit);
+}
+
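+// Launch helper: round the element count up to a whole number of
+// SYCL_GLU_BLOCK_SIZE blocks and submit the kernel on the given queue.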
+template <typename T>
+static void swiglu_oai_sycl(const T * x,
+                            const T * g,
+                            T * dst,
+                            const int64_t k,
+                            const int64_t n,
+                            const int64_t o0,
+                            const int64_t o1,
+                            const float alpha,
+                            const float limit,
+                            dpct::queue_ptr stream) {
+    const int64_t num_blocks = (k + SYCL_GLU_BLOCK_SIZE - 1) / SYCL_GLU_BLOCK_SIZE;
+    stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_GLU_BLOCK_SIZE),
+                                           sycl::range<3>(1, 1, SYCL_GLU_BLOCK_SIZE)),
+                         [=](sycl::nd_item<3> item_ct1) {
+                             swiglu_oai_kernel(x, g, dst, k, n, o0, o1, alpha, limit, item_ct1);
+                         });
+}
+
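+// Host-side op: validate tensor layouts, read swapped/alpha/limit from
+// op_params, and launch over every element of dst.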
+void ggml_sycl_op_swiglu_oai(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+    void * src0_d = src0->data;
+    void * src1_d = src1 ? src1->data : src0->data;
+    const int64_t src0_o = src0->nb[1];
+    const int64_t src1_o = src1 ? src1->nb[1] : src0->nb[1];
+    void * dst_d = dst->data;
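+    // With no separate gate tensor, each src0 row packs value and gate halves,
+    // so the gated row width is ne[0] / 2.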
+    const int64_t nc = src1 ? src0->ne[0] : src0->ne[0] / 2;
+    dpct::queue_ptr stream = ctx.stream();
+
+    GGML_ASSERT(ggml_is_contiguous_1(src0));
+    GGML_ASSERT(src0->nb[0] == ggml_element_size(src0));
+    GGML_ASSERT(ggml_is_contiguous(dst));
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->type == dst->type);
+    GGML_ASSERT(dst->ne[0] == nc);
+    GGML_ASSERT(ggml_nrows(dst) == ggml_nrows(src0));
+
+    if (src1) {
+        GGML_ASSERT(ggml_is_contiguous_1(src1));
+        GGML_ASSERT(src1->nb[0] == ggml_element_size(src1));
+        GGML_ASSERT(src1->ne[0] == nc);
+        GGML_ASSERT(src0->type == src1->type);
+    }
+
+    //const int32_t swapped = ((const int32_t *) dst->op_params)[1];
+    const int32_t swapped = ggml_get_op_params_i32(dst, 1);
+    const float alpha = ggml_get_op_params_f32(dst, 2);
+    const float limit = ggml_get_op_params_f32(dst, 3);
+
+    float * src0_p = (float *) src0_d;
+    float * src1_p = (float *) src1_d;
+
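+    // Fused layout: point the value/gate pointers at the two halves of src0;
+    // `swapped` (op_params[1]) selects which half is the gate.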
+    if (!src1) {
+        src0_p += swapped ? nc : 0;
+        src1_p += swapped ? 0 : nc;
+    }
+
+    swiglu_oai_sycl(src0_p, src1_p, (float *)dst_d, ggml_nelements(dst), nc, src0_o / sizeof(float), src1_o / sizeof(float), alpha, limit, stream);
+}
+
 static inline void ggml_sycl_op_geglu_erf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst,
         [](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) {
@@ -1070,6 +1162,11 @@ void ggml_sycl_swiglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     ggml_sycl_op_swiglu(ctx, dst);
 }
 
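+// Backend entry point, mirroring the other fused-GLU wrappers below.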
+void ggml_sycl_swiglu_oai(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
+    ggml_sycl_op_swiglu_oai(ctx, dst);
+}
+
 void ggml_sycl_geglu_erf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
     ggml_sycl_op_geglu_erf(ctx, dst);