|
|
@@ -49,6 +49,8 @@ void hvx_mul_f32(const uint8_t * restrict src0,
|
|
|
FARF(HIGH, "hvx_mul_f32: unaligned loop in hvx op, possibly slower execution\n");
|
|
|
}
|
|
|
|
|
|
+
|
|
|
+ bool handled_leftover = false;
|
|
|
if (0 == unaligned_loop) {
|
|
|
HVX_Vector * restrict vec_in1 = (HVX_Vector *) src0;
|
|
|
HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1;
|
|
|
@@ -60,18 +62,59 @@ void hvx_mul_f32(const uint8_t * restrict src0,
|
|
|
*vec_out++ = Q6_Vsf_equals_Vqf32(v);
|
|
|
}
|
|
|
} else {
|
|
|
+ int step_of_1 = num_elems_whole >> 5; // divby 32, because 32 float = 128 bytes per HVX vector
|
|
|
+ int leftover_size = left_over * sizeof(float);
|
|
|
+
|
|
|
+
|
|
|
+ HVX_Vector * restrict vec_in1 = (HVX_Vector *) src0;
|
|
|
+ HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1;
|
|
|
+ HVX_UVector * restrict vec_out = (HVX_UVector *) dst;
|
|
|
+
|
|
|
+ HVX_Vector slinep;
|
|
|
+ HVX_Vector slinec;
|
|
|
+ HVX_Vector sline;
|
|
|
+ HVX_Vector sline2p;
|
|
|
+ HVX_Vector sline2c;
|
|
|
+ HVX_Vector sline2;
|
|
|
+
|
|
|
+ slinep = *vec_in1++;
|
|
|
+ sline2p = *vec_in2++;
|
|
|
#pragma unroll(4)
|
|
|
- for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
|
|
|
- HVX_Vector in1 = *(HVX_UVector *) (src0 + i * SIZEOF_FP32);
|
|
|
- HVX_Vector in2 = *(HVX_UVector *) (src1 + i * SIZEOF_FP32);
|
|
|
+ for (int i = step_of_1 - 1; i > 0; i--) {
|
|
|
+ slinec = *vec_in1++;
|
|
|
+ sline2c = *vec_in2++;
|
|
|
+ sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src0);
|
|
|
+ sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1);
|
|
|
+
|
|
|
+ *((HVX_UVector *) (vec_out++)) = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, sline2));
|
|
|
+ slinep = slinec;
|
|
|
+ sline2p = sline2c;
|
|
|
+ }
|
|
|
+ if (step_of_1 > 1) {
|
|
|
+ slinec = htp_is_aligned(vec_in1, VLEN) && left_over == 0 ? slinep : *vec_in1++;
|
|
|
+ sline2c = htp_is_aligned(vec_in2, VLEN) && left_over == 0 ? sline2p : *vec_in2++;
|
|
|
+
|
|
|
+ sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src0);
|
|
|
+ sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1);
|
|
|
+ *((HVX_UVector *) (vec_out++)) = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, sline2));
|
|
|
+ slinep = slinec;
|
|
|
+ sline2p = sline2c;
|
|
|
+ }
|
|
|
+ if (left_over > 0) {
|
|
|
+ slinec = (is_in_one_chunk(vec_in1, leftover_size, VLEN) ? slinep : *vec_in1++);
|
|
|
|
|
|
- HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in1, in2);
|
|
|
+ sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src0);
|
|
|
+ sline2c = (is_in_one_chunk(vec_in2, leftover_size, VLEN) ? sline2p : *vec_in2++);
|
|
|
+ sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) src1);
|
|
|
|
|
|
- *(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out);
|
|
|
+ HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(sline, sline2);
|
|
|
+ hvx_vec_store_u(vec_out, leftover_size, Q6_Vsf_equals_Vqf32(out));
|
|
|
+ handled_leftover = true;
|
|
|
}
|
|
|
}
|
|
|
|
|
|
- if (left_over > 0) {
|
|
|
+
|
|
|
+ if (left_over > 0 && !handled_leftover) {
|
|
|
const float * src0f = (const float *) src0 + num_elems_whole;
|
|
|
const float * src1f = (const float *) src1 + num_elems_whole;
|
|
|
float * dstf = (float *) dst + num_elems_whole;
|
|
|
@@ -464,7 +507,7 @@ void hvx_mul_scalar_f32(const uint8_t * restrict src, const float val, uint8_t *
|
|
|
}
|
|
|
|
|
|
HVX_Vector val_vec = hvx_vec_splat_fp32(val);
|
|
|
-
|
|
|
+ bool handled_leftover = false;
|
|
|
if (0 == unaligned_loop) {
|
|
|
HVX_Vector * restrict vec_in1 = (HVX_Vector *) src;
|
|
|
HVX_Vector * restrict vec_out = (HVX_Vector *) dst;
|
|
|
@@ -475,17 +518,47 @@ void hvx_mul_scalar_f32(const uint8_t * restrict src, const float val, uint8_t *
|
|
|
*vec_out++ = Q6_Vsf_equals_Vqf32(v);
|
|
|
}
|
|
|
} else {
|
|
|
+ int step_of_1 = num_elems >> 5; // divby 32, because 32 float = 128 bytes per HVX vector
|
|
|
+ int leftover_size = left_over * sizeof(float);
|
|
|
+
|
|
|
+ HVX_Vector * input_v_ptr = (HVX_Vector *) src;
|
|
|
+ HVX_UVector * output_v_ptr = (HVX_UVector *) dst;
|
|
|
+
|
|
|
+ HVX_Vector slinep;
|
|
|
+ HVX_Vector slinec;
|
|
|
+ HVX_Vector sline;
|
|
|
+
|
|
|
+ slinep = *input_v_ptr++;
|
|
|
+
|
|
|
#pragma unroll(4)
|
|
|
- for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
|
|
|
- HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32);
|
|
|
+ for (int i = step_of_1 - 1; i > 0; i--) {
|
|
|
+ slinec = *input_v_ptr++;
|
|
|
+ sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src);
|
|
|
+ *((HVX_UVector *) (output_v_ptr++)) = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, val_vec));
|
|
|
+ /* Prepare slinep for next iteration */
|
|
|
+ slinep = slinec;
|
|
|
+ }
|
|
|
|
|
|
- HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in, val_vec);
|
|
|
+ if (step_of_1 > 0) {
|
|
|
+ slinec = htp_is_aligned(input_v_ptr, VLEN) && left_over == 0 ? slinep : *input_v_ptr++;
|
|
|
+ sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src);
|
|
|
+ *((HVX_UVector *) (output_v_ptr++)) = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, val_vec));
|
|
|
|
|
|
- *(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out);
|
|
|
+ slinep = slinec;
|
|
|
+ }
|
|
|
+
|
|
|
+ if (leftover_size > 0) {
|
|
|
+ slinec = (is_in_one_chunk(input_v_ptr, leftover_size, VLEN) ? slinep : *input_v_ptr++);
|
|
|
+
|
|
|
+ sline = Q6_V_valign_VVR(slinec, slinep, (size_t) src);
|
|
|
+
|
|
|
+ HVX_Vector sout = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(sline, val_vec));
|
|
|
+ hvx_vec_store_u(output_v_ptr, leftover_size, sout);
|
|
|
+ handled_leftover = true;
|
|
|
}
|
|
|
}
|
|
|
|
|
|
- if (left_over > 0) {
|
|
|
+ if (left_over > 0 && !handled_leftover) {
|
|
|
const float * srcf = (const float *) src + num_elems_whole;
|
|
|
float * dstf = (float *) dst + num_elems_whole;
|
|
|
|