@@ -263,7 +263,8 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0,
                                        struct htp_spad * dst_spad,
                                        uint32_t nth,
                                        uint32_t ith,
-                                       uint32_t src0_nrows_per_thread) {
+                                       uint32_t src0_nrows_per_thread,
+                                       dma_queue * dma_queue) {
     htp_act_preamble2;
 
     uint64_t t1, t2;
@@ -271,6 +272,8 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0,
 
     const size_t src0_row_size = nb01;
     const size_t dst_row_size = nb1;
+    const size_t src0_row_size_aligned = htp_round_up(src0_row_size, VLEN);
+    const size_t dst_row_size_aligned = htp_round_up(dst_row_size, VLEN);
 
     const uint32_t src0_nrows = ne01 * ne02 * ne03;
 
@@ -282,60 +285,81 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0,
         return;
     }
 
-    int is_aligned = 1;
-    int opt_path = 0;
-    if (!htp_is_aligned((void *) src0->data, VLEN) || !htp_is_aligned((void *) dst->data, VLEN)) {
-        is_aligned = 0;
-        FARF(HIGH, "silu-f32: unaligned addresses in elementwise op, possibly slower execution\n");
-    }
-    if ((1 == is_aligned) && !(nb01 & (VLEN - 1))) {
-        opt_path = 1;
+    const uint8_t * data_src0 = (const uint8_t *) src0->data;
+    uint8_t * data_dst = (uint8_t *) dst->data;
+
+    uint8_t * src0_spad_data = src0_spad->data + (ith * src0_spad->size_per_thread);
+    uint8_t * dst_spad_data = dst_spad->data + (ith * dst_spad->size_per_thread);
+
+    // Given src0_spad->size_per_thread, split it into two halves used as a ping-pong buffer for src0
+    size_t src0_spad_half_size = src0_spad->size_per_thread / 2;
+    size_t dst_spad_half_size = dst_spad->size_per_thread / 2;
+
+    // gelu = x * sigmoid(1.702 * x)
+    const int BLOCK = src0_spad_half_size / src0_row_size_aligned; // How many rows can we process in one block
+
+    if (BLOCK == 0) {
+        FARF(ERROR, "gelu-f32 : current VTCM reservation %zu is too small for even 1 row per thread, needed at least %zu\n",
+             src0_spad->size_per_thread, src0_row_size_aligned);
+        return;
     }
 
-    const uint8_t * restrict data_src0 = (const uint8_t *) src0->data;
-    uint8_t * restrict data_dst = (uint8_t *) dst->data;
+    // See discussion: https://github.com/ggml-org/llama.cpp/pull/18151#issuecomment-3678235379
+    for (uint32_t ir = src0_start_row, spad_idx = 0; ir < src0_end_row && spad_idx < 2; ir += BLOCK, spad_idx++) {
+        const uint32_t block_size = MIN(BLOCK, src0_end_row - ir);
 
-    uint8_t * restrict src0_spad_data = src0_spad->data + (ith * src0_row_size);
-    uint8_t * restrict dst_spad_data = dst_spad->data + (ith * dst_row_size);
+        // Dummy DMA transaction for sequencing (interleaving dst, src, dst, ...)
+        dma_queue_push_vtcm_to_ddr(dma_queue,
+                                   dma_make_ptr(data_dst, dst_spad_data + (spad_idx * dst_spad_half_size)),
+                                   dst_row_size, dst_row_size_aligned, 0);
+
+        dma_queue_push_ddr_to_vtcm(dma_queue,
+                                   dma_make_ptr(src0_spad_data + (spad_idx * src0_spad_half_size), data_src0 + (ir * src0_row_size)),
+                                   src0_row_size_aligned, src0_row_size, block_size);
+    }
 
-    const int BLOCK = 8;
     for (uint32_t ir = src0_start_row; ir < src0_end_row; ir += BLOCK) {
-        const uint32_t block_end = MIN(ir + BLOCK, src0_end_row);
+        const uint32_t block_size = MIN(BLOCK, src0_end_row - ir);
 
-        // Prefetch next block
-        if (block_end < src0_end_row) {
-            const float * restrict prefetch_ptr = (float *) (data_src0 + (block_end * src0_row_size));
-            htp_l2fetch(prefetch_ptr, 1, block_end * src0_row_size, src0_row_size);
-        }
+        float * dst_spad = (float *) dma_queue_pop(dma_queue).src;
+        float * src0_spad = (float *) dma_queue_pop(dma_queue).dst;
 
-        // Process rows in current block
-        for (uint32_t ib = ir; ib < block_end; ib++) {
-            const float * restrict src0 = (float *) (data_src0 + (ib * src0_row_size));
-            float * restrict dst = (float *) (data_dst + (ib * dst_row_size));
+        for (uint32_t ib = 0; ib < block_size; ib++) {
+            const float * src0_spad_ptr = src0_spad + ib * (src0_row_size_aligned / sizeof(float));
+            float * dst_spad_ptr = dst_spad + ib * (dst_row_size_aligned / sizeof(float));
 
             // gelu = x * sigmoid(1.702 * x) // current implementation
-            if (1 == opt_path) {
-                hvx_mul_scalar_f32((const uint8_t *) src0, (float) 1.702, (uint8_t *) src0_spad_data, ne0);
-                hvx_fast_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0);
-                hvx_mul_f32_opt((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0);
-            } else {
-                hvx_mul_scalar_f32( (const uint8_t *) src0, (float)1.702, (uint8_t *) src0_spad_data, ne0);
-                hvx_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0);
-                hvx_mul_f32((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0);
-            }
+            hvx_mul_scalar_f32((const uint8_t *) src0_spad_ptr, (float) 1.702, (uint8_t *) dst_spad_ptr, ne0);
+            hvx_fast_sigmoid_f32((const uint8_t *) dst_spad_ptr, (uint8_t *) dst_spad_ptr, ne0);
+            hvx_mul_f32_opt((const uint8_t *) src0_spad_ptr, (uint8_t *) dst_spad_ptr, (uint8_t *) dst_spad_ptr, ne0);
+        }
+
+        dma_queue_push_vtcm_to_ddr(dma_queue,
+                                   dma_make_ptr(data_dst + (ir * dst_row_size), dst_spad),
+                                   dst_row_size, dst_row_size_aligned, block_size);
+
+        // Prefetch the N+2 loop iteration, if any
+        const uint32_t pref_block = (ir + BLOCK * 2);
+        if (pref_block < src0_end_row) {
+            const uint32_t pref_block_size = MIN(BLOCK, src0_end_row - pref_block);
+            dma_queue_push_ddr_to_vtcm(dma_queue,
+                                       dma_make_ptr(src0_spad, data_src0 + (pref_block * src0_row_size)),
+                                       src0_row_size_aligned, src0_row_size, pref_block_size);
         }
     }
 
+    dma_queue_flush(dma_queue);
+
     t2 = HAP_perf_get_qtimer_count();
 
-    FARF(HIGH, "gelu-f32 %d/%d/%d: %ux%ux%ux%u (%u:%u) -> %ux%ux%ux%u usec %u\n", ith, nth, opt_path, ne00, ne01, ne02,
+    FARF(HIGH, "gelu-f32 %d/%d: %ux%ux%ux%u (%u:%u) -> %ux%ux%ux%u usec %u\n", ith, nth, ne00, ne01, ne02,
          ne03, src0_start_row, src0_end_row, ne0, ne1, ne2, ne3, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
 }
 
 static void unary_gelu_fp32(unsigned int n, unsigned int i, void * data) {
     struct htp_ops_context * octx = (struct htp_ops_context *) data;
     unary_gelu_fp32_per_thread(&octx->src0, &octx->dst, octx->op_params, &octx->src0_spad, &octx->dst_spad, n, i,
-                               octx->src0_nrows_per_thread);
+                               octx->src0_nrows_per_thread, octx->ctx->dma[i]);
 }
 
 
@@ -468,21 +492,45 @@ static int execute_op_activations_fp32(struct htp_ops_context * octx) {
     const uint32_t n_threads = octx->n_threads;
     const uint32_t src0_nrows = src0->ne[1] * src0->ne[2] * src0->ne[3];
 
-    const size_t src0_row_size = src0->nb[1];
-    const size_t src1_row_size = src1->ne[0] ? src1->nb[1] : src0->nb[1];
-    const size_t dst_row_size = dst->nb[1];
+    size_t src0_row_size = src0->nb[1];
+    size_t src1_row_size = src1->nb[1]; // zero bytes if src1 is not used
+    size_t dst_row_size = dst->nb[1];
+
+    const bool src1_valid = src1->ne[0];
+    if (!src1_valid) {
+        src1_row_size = src0_row_size;
+    }
 
+    const size_t src0_row_size_aligned = htp_round_up(src0_row_size, VLEN);
+    const size_t src1_row_size_aligned = htp_round_up(src1_row_size, VLEN);
+    const size_t dst_row_size_aligned = htp_round_up(dst_row_size, VLEN);
     // VTCM scratchpads for all tensors
     // N rows per thread, padded to HVX vector size
-    octx->dst_spad.size = htp_round_up(dst_row_size, 128) * octx->n_threads;
-    octx->src0_spad.size = htp_round_up(src0_row_size, 128) * octx->n_threads;
-    octx->src1_spad.size = htp_round_up(src1_row_size, 128) * octx->n_threads;
 
-    size_t spad_size = octx->src0_spad.size + octx->src1_spad.size + octx->dst_spad.size;
+    size_t spad_size_per_row = (src0_row_size_aligned + src1_row_size_aligned) + dst_row_size_aligned;
+    size_t vtcm_row_per_thread = octx->ctx->vtcm_size / (n_threads * spad_size_per_row);
+
+    // Make sure the reserved vtcm size is sufficient
+    if (vtcm_row_per_thread == 0) {
+        FARF(ERROR, "act-%s : current VTCM reservation %zu is too small for even 1 row per thread, needed at least %zu\n", op_type, octx->ctx->vtcm_size,
+             spad_size_per_row * n_threads);
+        return HTP_STATUS_VTCM_TOO_SMALL;
+    }
+
+    octx->src0_spad.size_per_thread = src0_row_size_aligned * vtcm_row_per_thread;
+    octx->src1_spad.size_per_thread = src1_row_size_aligned * vtcm_row_per_thread;
+    octx->dst_spad.size_per_thread = dst_row_size_aligned * vtcm_row_per_thread;
+
+    octx->dst_spad.size = n_threads * octx->dst_spad.size_per_thread;
+    octx->src0_spad.size = n_threads * octx->src0_spad.size_per_thread;
+    octx->src1_spad.size = n_threads * octx->src1_spad.size_per_thread;
+
+    octx->src0_spad.data = octx->ctx->vtcm_base;
+    octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size;
+    octx->dst_spad.data = octx->src1_spad.data + octx->src1_spad.size;
 
     if (src1->ne[0]) {
-        FARF(HIGH,
-             "%s: %ux%ux%ux%u x %ux%ux%ux%u -> %ux%ux%ux%u : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n",
+        FARF(HIGH, "%s: %ux%ux%ux%u x %ux%ux%ux%u -> %ux%ux%ux%u : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n",
             op_type, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src1->ne[0], src1->ne[1], src1->ne[2],
             src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], octx->src0_spad.size, octx->src1_spad.size,
             octx->dst_spad.size);
@@ -492,20 +540,8 @@ static int execute_op_activations_fp32(struct htp_ops_context * octx) {
              octx->src0_spad.size, octx->src1_spad.size, octx->dst_spad.size);
     }
 
-    // Make sure the reserved vtcm size is sufficient
-    if (octx->ctx->vtcm_size < spad_size) {
-        FARF(ERROR, "act-%s : current VTCM reservation %zu is too small, needed %zu\n", op_type, octx->ctx->vtcm_size,
-             spad_size);
-        return HTP_STATUS_VTCM_TOO_SMALL;
-    }
-
-    octx->src0_spad.data = octx->ctx->vtcm_base;
-    octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size;
-    octx->dst_spad.data = octx->src1_spad.data + octx->src1_spad.size;
-
     if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) {
         uint32_t n_jobs = MIN(n_threads, src0_nrows);
-
         octx->src0_nrows_per_thread = (src0_nrows + n_jobs - 1) / n_jobs;
         worker_pool_run_func(octx->ctx->worker_pool, act_op_func, octx, n_jobs);
     }
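
For reference, the restructured gelu loop above is a standard double-buffered (ping-pong) DMA pipeline: two input blocks are queued up front, then each iteration pops a completed transfer, runs the HVX gelu on it, queues the result back to DDR, and prefetches the block two iterations ahead into the half that was just freed. A minimal plain-C sketch of that control flow, with memcpy standing in for the DMA queue and purely hypothetical names, might look like this:

    // Illustrative sketch only (not code from the patch): the ping-pong flow in plain C.
    #include <math.h>
    #include <stddef.h>
    #include <string.h>

    #define NBUF 2 // two scratch halves -> ping-pong

    static void pingpong_gelu_rows(const float * src, float * dst, size_t nrows, size_t ncols, size_t block) {
        float buf[NBUF][block * ncols]; // stand-in for the two per-thread VTCM halves (VLA for brevity)

        // Prefill: fetch the first two blocks ("ddr -> vtcm")
        for (size_t ir = 0, i = 0; ir < nrows && i < NBUF; ir += block, i++) {
            size_t n = (nrows - ir < block) ? (nrows - ir) : block;
            memcpy(buf[i], src + ir * ncols, n * ncols * sizeof(float));
        }

        for (size_t ir = 0; ir < nrows; ir += block) {
            size_t i = (ir / block) % NBUF; // which half holds this block
            size_t n = (nrows - ir < block) ? (nrows - ir) : block;

            // Compute in place: gelu(x) ~= x * sigmoid(1.702 * x)
            for (size_t k = 0; k < n * ncols; k++) {
                float x  = buf[i][k];
                buf[i][k] = x / (1.0f + expf(-1.702f * x));
            }

            // Write results back ("vtcm -> ddr")
            memcpy(dst + ir * ncols, buf[i], n * ncols * sizeof(float));

            // Prefetch the N+2 block into the half that was just freed
            size_t pref = ir + 2 * block;
            if (pref < nrows) {
                size_t pn = (nrows - pref < block) ? (nrows - pref) : block;
                memcpy(buf[i], src + pref * ncols, pn * ncols * sizeof(float));
            }
        }
    }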