|
|
@@ -392,7 +392,7 @@ kernel void kernel_soft_max(
|
|
|
float lmax = -INFINITY;
|
|
|
|
|
|
for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
|
|
|
- lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]);
|
|
|
+ lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f));
|
|
|
}
|
|
|
|
|
|
// find the max value in the block
|
|
|
@@ -417,7 +417,7 @@ kernel void kernel_soft_max(
|
|
|
// parallel sum
|
|
|
float lsum = 0.0f;
|
|
|
for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
|
|
|
- const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]) - max_val);
|
|
|
+ const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f)) - max_val);
|
|
|
lsum += exp_psrc0;
|
|
|
pdst[i00] = exp_psrc0;
|
|
|
}
|
|
|
@@ -495,7 +495,7 @@ kernel void kernel_soft_max_4(
|
|
|
float4 lmax4 = -INFINITY;
|
|
|
|
|
|
for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
|
|
|
- lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]);
|
|
|
+ lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f));
|
|
|
}
|
|
|
|
|
|
const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3]));
|
|
|
@@ -521,7 +521,7 @@ kernel void kernel_soft_max_4(
|
|
|
// parallel sum
|
|
|
float4 lsum4 = 0.0f;
|
|
|
for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
|
|
|
- const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]) - max_val);
|
|
|
+ const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f)) - max_val);
|
|
|
lsum4 += exp_psrc4;
|
|
|
pdst4[i00] = exp_psrc4;
|
|
|
}
|