|
|
@@ -1,7 +1,7 @@
|
|
|
#include "common.hpp"
|
|
|
#include "element_wise.hpp"
|
|
|
|
|
|
-void acc_f32(const float * x, const float * y, float * dst, const int ne,
|
|
|
+static void acc_f32(const float * x, const float * y, float * dst, const int ne,
|
|
|
const int ne10, const int ne11, const int ne12,
|
|
|
const int nb1, const int nb2, int offset, const sycl::nd_item<3> &item_ct1) {
|
|
|
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
|
|
@@ -20,7 +20,7 @@ void acc_f32(const float * x, const float * y, float * dst, const int ne,
|
|
|
}
|
|
|
}
|
|
|
|
|
|
-void gelu_f32(const float * x, float * dst, const int k,
|
|
|
+static void gelu_f32(const float * x, float * dst, const int k,
|
|
|
const sycl::nd_item<3> &item_ct1) {
|
|
|
const float GELU_COEF_A = 0.044715f;
|
|
|
const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
|
|
|
@@ -37,7 +37,7 @@ void gelu_f32(const float * x, float * dst, const int k,
|
|
|
sycl::tanh(SQRT_2_OVER_PI * xi * (1.0f + GELU_COEF_A * xi * xi)));
|
|
|
}
|
|
|
|
|
|
-void silu_f32(const float * x, float * dst, const int k,
|
|
|
+static void silu_f32(const float * x, float * dst, const int k,
|
|
|
const sycl::nd_item<3> &item_ct1) {
|
|
|
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
|
|
item_ct1.get_local_id(2);
|
|
|
@@ -48,7 +48,7 @@ void silu_f32(const float * x, float * dst, const int k,
|
|
|
dst[i] = x[i] / (1.0f + sycl::native::exp(-x[i]));
|
|
|
}
|
|
|
|
|
|
-void gelu_quick_f32(const float *x, float *dst, int k,
|
|
|
+static void gelu_quick_f32(const float *x, float *dst, int k,
|
|
|
const sycl::nd_item<3> &item_ct1) {
|
|
|
const float GELU_QUICK_COEF = -1.702f;
|
|
|
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
|
|
@@ -59,7 +59,7 @@ void gelu_quick_f32(const float *x, float *dst, int k,
|
|
|
dst[i] = x[i] * (1.0f / (1.0f + sycl::native::exp(GELU_QUICK_COEF * x[i])));
|
|
|
}
|
|
|
|
|
|
-void tanh_f32(const float *x, float *dst, int k,
|
|
|
+static void tanh_f32(const float *x, float *dst, int k,
|
|
|
const sycl::nd_item<3> &item_ct1) {
|
|
|
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
|
|
item_ct1.get_local_id(2);
|
|
|
@@ -69,7 +69,7 @@ void tanh_f32(const float *x, float *dst, int k,
|
|
|
dst[i] = sycl::tanh((float)(x[i]));
|
|
|
}
|
|
|
|
|
|
-void relu_f32(const float * x, float * dst, const int k,
|
|
|
+static void relu_f32(const float * x, float * dst, const int k,
|
|
|
const sycl::nd_item<3> &item_ct1) {
|
|
|
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
|
|
item_ct1.get_local_id(2);
|
|
|
@@ -80,7 +80,7 @@ void relu_f32(const float * x, float * dst, const int k,
|
|
|
dst[i] = sycl::fmax((float)(x[i]), (float)0);
|
|
|
}
|
|
|
|
|
|
-void sigmoid_f32(const float * x, float * dst, const int k,
|
|
|
+static void sigmoid_f32(const float * x, float * dst, const int k,
|
|
|
const sycl::nd_item<3> &item_ct1) {
|
|
|
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
|
|
item_ct1.get_local_id(2);
|
|
|
@@ -91,7 +91,7 @@ void sigmoid_f32(const float * x, float * dst, const int k,
|
|
|
dst[i] = 1.0f / (1.0f + sycl::native::exp(-x[i]));
|
|
|
}
|
|
|
|
|
|
-void sqrt_f32(const float * x, float * dst, const int k,
|
|
|
+static void sqrt_f32(const float * x, float * dst, const int k,
|
|
|
const sycl::nd_item<3> &item_ct1) {
|
|
|
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
|
|
item_ct1.get_local_id(2);
|
|
|
@@ -102,7 +102,7 @@ void sqrt_f32(const float * x, float * dst, const int k,
|
|
|
dst[i] = sycl::sqrt(x[i]);
|
|
|
}
|
|
|
|
|
|
-void sin_f32(const float * x, float * dst, const int k,
|
|
|
+static void sin_f32(const float * x, float * dst, const int k,
|
|
|
const sycl::nd_item<3> &item_ct1) {
|
|
|
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
|
|
item_ct1.get_local_id(2);
|
|
|
@@ -113,7 +113,7 @@ void sin_f32(const float * x, float * dst, const int k,
|
|
|
dst[i] = sycl::sin(x[i]);
|
|
|
}
|
|
|
|
|
|
-void cos_f32(const float * x, float * dst, const int k,
|
|
|
+static void cos_f32(const float * x, float * dst, const int k,
|
|
|
const sycl::nd_item<3> &item_ct1) {
|
|
|
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
|
|
item_ct1.get_local_id(2);
|
|
|
@@ -124,7 +124,7 @@ void cos_f32(const float * x, float * dst, const int k,
|
|
|
dst[i] = sycl::cos(x[i]);
|
|
|
}
|
|
|
|
|
|
-void hardsigmoid_f32(const float * x, float * dst, const int k,
|
|
|
+static void hardsigmoid_f32(const float * x, float * dst, const int k,
|
|
|
const sycl::nd_item<3> &item_ct1) {
|
|
|
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
|
|
item_ct1.get_local_id(2);
|
|
|
@@ -135,7 +135,7 @@ void hardsigmoid_f32(const float * x, float * dst, const int k,
|
|
|
dst[i] = sycl::fmin(1.0f, sycl::fmax(0.0f, (x[i] + 3.0f) / 6.0f));
|
|
|
}
|
|
|
|
|
|
-void hardswish_f32(const float * x, float * dst, const int k,
|
|
|
+static void hardswish_f32(const float * x, float * dst, const int k,
|
|
|
const sycl::nd_item<3> &item_ct1) {
|
|
|
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
|
|
item_ct1.get_local_id(2);
|
|
|
@@ -146,7 +146,7 @@ void hardswish_f32(const float * x, float * dst, const int k,
|
|
|
dst[i] = x[i] * sycl::fmin(1.0f, sycl::fmax(0.0f, (x[i] + 3.0f) / 6.0f));
|
|
|
}
|
|
|
|
|
|
-void exp_f32(const float * x, float * dst, const int k,
|
|
|
+static void exp_f32(const float * x, float * dst, const int k,
|
|
|
const sycl::nd_item<3> &item_ct1) {
|
|
|
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
|
|
item_ct1.get_local_id(2);
|
|
|
@@ -157,7 +157,7 @@ void exp_f32(const float * x, float * dst, const int k,
|
|
|
dst[i] = sycl::exp(x[i]);
|
|
|
}
|
|
|
|
|
|
-void log_f32(const float * x, float * dst, const int k,
|
|
|
+static void log_f32(const float * x, float * dst, const int k,
|
|
|
const sycl::nd_item<3> &item_ct1) {
|
|
|
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
|
|
item_ct1.get_local_id(2);
|
|
|
@@ -173,7 +173,7 @@ void log_f32(const float * x, float * dst, const int k,
|
|
|
}
|
|
|
}
|
|
|
|
|
|
-void neg_f32(const float * x, float * dst, const int k,
|
|
|
+static void neg_f32(const float * x, float * dst, const int k,
|
|
|
const sycl::nd_item<3> &item_ct1) {
|
|
|
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
|
|
item_ct1.get_local_id(2);
|
|
|
@@ -184,7 +184,7 @@ void neg_f32(const float * x, float * dst, const int k,
|
|
|
dst[i] = -x[i];
|
|
|
}
|
|
|
|
|
|
-void step_f32(const float * x, float * dst, const int k,
|
|
|
+static void step_f32(const float * x, float * dst, const int k,
|
|
|
const sycl::nd_item<3> &item_ct1) {
|
|
|
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
|
|
item_ct1.get_local_id(2);
|
|
|
@@ -195,7 +195,7 @@ void step_f32(const float * x, float * dst, const int k,
|
|
|
dst[i] = x[i] > 0.0f;
|
|
|
}
|
|
|
|
|
|
-void leaky_relu_f32(const float *x, float *dst, const int k, const float negative_slope,
|
|
|
+static void leaky_relu_f32(const float *x, float *dst, const int k, const float negative_slope,
|
|
|
const sycl::nd_item<3> &item_ct1) {
|
|
|
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
|
|
item_ct1.get_local_id(2);
|
|
|
@@ -206,7 +206,7 @@ void leaky_relu_f32(const float *x, float *dst, const int k, const float negativ
|
|
|
sycl::fmin((float)(x[i]), 0.0f) * negative_slope;
|
|
|
}
|
|
|
|
|
|
-void sqr_f32(const float * x, float * dst, const int k,
|
|
|
+static void sqr_f32(const float * x, float * dst, const int k,
|
|
|
const sycl::nd_item<3> &item_ct1) {
|
|
|
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
|
|
item_ct1.get_local_id(2);
|
|
|
@@ -217,7 +217,7 @@ void sqr_f32(const float * x, float * dst, const int k,
|
|
|
dst[i] = x[i] * x[i];
|
|
|
}
|
|
|
|
|
|
-void upscale_f32(const float *x, float *dst, const int nb00, const int nb01,
|
|
|
+static void upscale_f32(const float *x, float *dst, const int nb00, const int nb01,
|
|
|
const int nb02, const int nb03, const int ne10, const int ne11,
|
|
|
const int ne12, const int ne13, const float sf0, const float sf1,
|
|
|
const float sf2, const float sf3, const sycl::nd_item<1> &item_ct1) {
|
|
|
@@ -240,7 +240,7 @@ void upscale_f32(const float *x, float *dst, const int nb00, const int nb01,
|
|
|
dst[index] = *(const float *)((const char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00);
|
|
|
}
|
|
|
|
|
|
-void pad_f32(const float *x, float *dst, const int ne0, const int ne00, const int ne01, const int ne02,
|
|
|
+static void pad_f32(const float *x, float *dst, const int ne0, const int ne00, const int ne01, const int ne02,
|
|
|
const sycl::nd_item<3> &item_ct1) {
|
|
|
int nidx = item_ct1.get_local_id(2) +
|
|
|
item_ct1.get_group(2) * item_ct1.get_local_range(2);
|
|
|
@@ -262,7 +262,7 @@ void pad_f32(const float *x, float *dst, const int ne0, const int ne00, const i
|
|
|
|
|
|
|
|
|
|
|
|
-void acc_f32_sycl(const float *x, const float *y, float *dst,
|
|
|
+static void acc_f32_sycl(const float *x, const float *y, float *dst,
|
|
|
const int n_elements, const int ne10, const int ne11,
|
|
|
const int ne12, const int nb1, const int nb2,
|
|
|
const int offset, queue_ptr stream) {
|
|
|
@@ -277,7 +277,7 @@ void acc_f32_sycl(const float *x, const float *y, float *dst,
|
|
|
});
|
|
|
}
|
|
|
|
|
|
-void gelu_f32_sycl(const float *x, float *dst, const int k,
|
|
|
+static void gelu_f32_sycl(const float *x, float *dst, const int k,
|
|
|
queue_ptr stream) {
|
|
|
const int num_blocks = (k + SYCL_GELU_BLOCK_SIZE - 1) / SYCL_GELU_BLOCK_SIZE;
|
|
|
stream->parallel_for(
|
|
|
@@ -289,7 +289,7 @@ void gelu_f32_sycl(const float *x, float *dst, const int k,
|
|
|
});
|
|
|
}
|
|
|
|
|
|
-void silu_f32_sycl(const float *x, float *dst, const int k,
|
|
|
+static void silu_f32_sycl(const float *x, float *dst, const int k,
|
|
|
queue_ptr stream) {
|
|
|
const int num_blocks = (k + SYCL_SILU_BLOCK_SIZE - 1) / SYCL_SILU_BLOCK_SIZE;
|
|
|
stream->parallel_for(
|
|
|
@@ -301,7 +301,7 @@ void silu_f32_sycl(const float *x, float *dst, const int k,
|
|
|
});
|
|
|
}
|
|
|
|
|
|
-void gelu_quick_f32_sycl(const float *x, float *dst, const int k,
|
|
|
+static void gelu_quick_f32_sycl(const float *x, float *dst, const int k,
|
|
|
queue_ptr stream) {
|
|
|
const int num_blocks = (k + SYCL_GELU_BLOCK_SIZE - 1) / SYCL_GELU_BLOCK_SIZE;
|
|
|
stream->parallel_for(
|
|
|
@@ -313,7 +313,7 @@ void gelu_quick_f32_sycl(const float *x, float *dst, const int k,
|
|
|
});
|
|
|
}
|
|
|
|
|
|
-void tanh_f32_sycl(const float *x, float *dst, const int k,
|
|
|
+static void tanh_f32_sycl(const float *x, float *dst, const int k,
|
|
|
queue_ptr stream) {
|
|
|
const int num_blocks = (k + SYCL_TANH_BLOCK_SIZE - 1) / SYCL_TANH_BLOCK_SIZE;
|
|
|
stream->parallel_for(
|
|
|
@@ -325,7 +325,7 @@ void tanh_f32_sycl(const float *x, float *dst, const int k,
|
|
|
});
|
|
|
}
|
|
|
|
|
|
-void relu_f32_sycl(const float *x, float *dst, const int k,
|
|
|
+static void relu_f32_sycl(const float *x, float *dst, const int k,
|
|
|
queue_ptr stream) {
|
|
|
const int num_blocks = (k + SYCL_RELU_BLOCK_SIZE - 1) / SYCL_RELU_BLOCK_SIZE;
|
|
|
stream->parallel_for(
|
|
|
@@ -337,7 +337,7 @@ void relu_f32_sycl(const float *x, float *dst, const int k,
|
|
|
});
|
|
|
}
|
|
|
|
|
|
-void hardsigmoid_f32_sycl(const float *x, float *dst, const int k,
|
|
|
+static void hardsigmoid_f32_sycl(const float *x, float *dst, const int k,
|
|
|
queue_ptr stream) {
|
|
|
const int num_blocks = (k + SYCL_HARDSIGMOID_BLOCK_SIZE - 1) / SYCL_HARDSIGMOID_BLOCK_SIZE;
|
|
|
stream->parallel_for(
|
|
|
@@ -349,7 +349,7 @@ void hardsigmoid_f32_sycl(const float *x, float *dst, const int k,
|
|
|
});
|
|
|
}
|
|
|
|
|
|
-void hardswish_f32_sycl(const float *x, float *dst, const int k,
|
|
|
+static void hardswish_f32_sycl(const float *x, float *dst, const int k,
|
|
|
queue_ptr stream) {
|
|
|
const int num_blocks = (k + SYCL_HARDSWISH_BLOCK_SIZE - 1) / SYCL_HARDSWISH_BLOCK_SIZE;
|
|
|
stream->parallel_for(
|
|
|
@@ -361,7 +361,7 @@ void hardswish_f32_sycl(const float *x, float *dst, const int k,
|
|
|
});
|
|
|
}
|
|
|
|
|
|
-void exp_f32_sycl(const float *x, float *dst, const int k,
|
|
|
+static void exp_f32_sycl(const float *x, float *dst, const int k,
|
|
|
queue_ptr stream) {
|
|
|
const int num_blocks = (k + SYCL_EXP_BLOCK_SIZE - 1) / SYCL_EXP_BLOCK_SIZE;
|
|
|
stream->parallel_for(
|
|
|
@@ -373,7 +373,7 @@ void exp_f32_sycl(const float *x, float *dst, const int k,
|
|
|
});
|
|
|
}
|
|
|
|
|
|
-void log_f32_sycl(const float *x, float *dst, const int k,
|
|
|
+static void log_f32_sycl(const float *x, float *dst, const int k,
|
|
|
queue_ptr stream) {
|
|
|
const int num_blocks = (k + SYCL_EXP_BLOCK_SIZE - 1) / SYCL_EXP_BLOCK_SIZE;
|
|
|
stream->parallel_for(
|
|
|
@@ -385,7 +385,7 @@ void log_f32_sycl(const float *x, float *dst, const int k,
|
|
|
});
|
|
|
}
|
|
|
|
|
|
-void neg_f32_sycl(const float *x, float *dst, const int k,
|
|
|
+static void neg_f32_sycl(const float *x, float *dst, const int k,
|
|
|
queue_ptr stream) {
|
|
|
const int num_blocks = (k + SYCL_NEG_BLOCK_SIZE - 1) / SYCL_NEG_BLOCK_SIZE;
|
|
|
stream->parallel_for(
|
|
|
@@ -397,7 +397,7 @@ void neg_f32_sycl(const float *x, float *dst, const int k,
|
|
|
});
|
|
|
}
|
|
|
|
|
|
-void step_f32_sycl(const float *x, float *dst, const int k,
|
|
|
+static void step_f32_sycl(const float *x, float *dst, const int k,
|
|
|
queue_ptr stream) {
|
|
|
const int num_blocks = (k + SYCL_NEG_BLOCK_SIZE - 1) / SYCL_NEG_BLOCK_SIZE;
|
|
|
stream->parallel_for(
|
|
|
@@ -409,7 +409,7 @@ void step_f32_sycl(const float *x, float *dst, const int k,
|
|
|
});
|
|
|
}
|
|
|
|
|
|
-void sigmoid_f32_sycl(const float *x, float *dst, const int k,
|
|
|
+static void sigmoid_f32_sycl(const float *x, float *dst, const int k,
|
|
|
queue_ptr stream) {
|
|
|
const int num_blocks = (k + SYCL_SIGMOID_BLOCK_SIZE - 1) / SYCL_SIGMOID_BLOCK_SIZE;
|
|
|
stream->parallel_for(
|
|
|
@@ -421,7 +421,7 @@ void sigmoid_f32_sycl(const float *x, float *dst, const int k,
|
|
|
});
|
|
|
}
|
|
|
|
|
|
-void sqrt_f32_sycl(const float *x, float *dst, const int k,
|
|
|
+static void sqrt_f32_sycl(const float *x, float *dst, const int k,
|
|
|
queue_ptr stream) {
|
|
|
const int num_blocks = (k + SYCL_SQRT_BLOCK_SIZE - 1) / SYCL_SQRT_BLOCK_SIZE;
|
|
|
stream->parallel_for(
|
|
|
@@ -433,7 +433,7 @@ void sqrt_f32_sycl(const float *x, float *dst, const int k,
|
|
|
});
|
|
|
}
|
|
|
|
|
|
-void sin_f32_sycl(const float *x, float *dst, const int k,
|
|
|
+static void sin_f32_sycl(const float *x, float *dst, const int k,
|
|
|
queue_ptr stream) {
|
|
|
const int num_blocks = (k + SYCL_SIN_BLOCK_SIZE - 1) / SYCL_SIN_BLOCK_SIZE;
|
|
|
stream->parallel_for(
|
|
|
@@ -445,7 +445,7 @@ void sin_f32_sycl(const float *x, float *dst, const int k,
|
|
|
});
|
|
|
}
|
|
|
|
|
|
-void cos_f32_sycl(const float *x, float *dst, const int k,
|
|
|
+static void cos_f32_sycl(const float *x, float *dst, const int k,
|
|
|
queue_ptr stream) {
|
|
|
const int num_blocks = (k + SYCL_SIN_BLOCK_SIZE - 1) / SYCL_SIN_BLOCK_SIZE;
|
|
|
stream->parallel_for(
|
|
|
@@ -457,7 +457,7 @@ void cos_f32_sycl(const float *x, float *dst, const int k,
|
|
|
});
|
|
|
}
|
|
|
|
|
|
-void leaky_relu_f32_sycl(const float *x, float *dst, const int k,
|
|
|
+static void leaky_relu_f32_sycl(const float *x, float *dst, const int k,
|
|
|
const float negative_slope,
|
|
|
queue_ptr stream) {
|
|
|
const int num_blocks = (k + SYCL_RELU_BLOCK_SIZE - 1) / SYCL_RELU_BLOCK_SIZE;
|
|
|
@@ -470,7 +470,7 @@ void leaky_relu_f32_sycl(const float *x, float *dst, const int k,
|
|
|
});
|
|
|
}
|
|
|
|
|
|
-void sqr_f32_sycl(const float *x, float *dst, const int k,
|
|
|
+static void sqr_f32_sycl(const float *x, float *dst, const int k,
|
|
|
queue_ptr stream) {
|
|
|
const int num_blocks = (k + SYCL_SQR_BLOCK_SIZE - 1) / SYCL_SQR_BLOCK_SIZE;
|
|
|
stream->parallel_for(
|
|
|
@@ -482,7 +482,7 @@ void sqr_f32_sycl(const float *x, float *dst, const int k,
|
|
|
});
|
|
|
}
|
|
|
|
|
|
-void upscale_f32_sycl(const float *x, float *dst, const int nb00, const int nb01,
|
|
|
+static void upscale_f32_sycl(const float *x, float *dst, const int nb00, const int nb01,
|
|
|
const int nb02, const int nb03, const int ne10, const int ne11,
|
|
|
const int ne12, const int ne13, const float sf0, const float sf1,
|
|
|
const float sf2, const float sf3, queue_ptr stream) {
|
|
|
@@ -496,7 +496,7 @@ void upscale_f32_sycl(const float *x, float *dst, const int nb00, const int nb01
|
|
|
});
|
|
|
}
|
|
|
|
|
|
-void pad_f32_sycl(const float *x, float *dst, const int ne00,
|
|
|
+static void pad_f32_sycl(const float *x, float *dst, const int ne00,
|
|
|
const int ne01, const int ne02, const int ne0,
|
|
|
const int ne1, const int ne2, queue_ptr stream) {
|
|
|
int num_blocks = (ne0 + SYCL_PAD_BLOCK_SIZE - 1) / SYCL_PAD_BLOCK_SIZE;
|