View source code

ggml-hexagon: add `hex_supported_buffer` for a better buffer support check (#17212)

* hexagon: add buffer support checks for hexagon sessions

* refactor: simplify buffer support checks in hexagon operations

* hexagon: update buffer support checks to use tensor structure

* refactor: streamline buffer initialization for DSP queue in hexagon operations

* refactor: simplify buffer initialization in DSP queue for hexagon operations

* refactor: optimize hex_supported_buffer with a fold expression

* wip

* refactor: simplify dspqueue_buffers_init function and its usage in hexagon operations

* fix: improve NaN handling in hvx_vec_fast_sigmoid_fp32_guard

* refactor: optimize hvx_vec_inverse_fp32_guard for better NaN handling

* refactor: update hvx_vec_fast_sigmoid_fp32_guard to use adjusted exponent limits

* refactor: modify hvx_vec_fast_sigmoid_fp32_guard to accept parameters for improved flexibility

* refactor: update hvx_vec_exp_fp32_guard to accept max_exp and inf parameters to save some instructions

* refactor: move hvx_vec_inverse_fp32_guard implementation to hvx-inverse.c for better perf
nullname 1 month ago
Parent
Commit
d5bc1ad110

+ 97 - 363
ggml/src/ggml-hexagon/ggml-hexagon.cpp

@@ -240,6 +240,23 @@ struct ggml_hexagon_session {
     uint32_t         prof_pkts;
 };
 
+static inline void hex_print_op_info(const ggml_tensor * op, ggml_hexagon_session * sess, const uint32_t req_flags) {
+    char dims[64 * GGML_MAX_SRC];
+    char strides[64 * GGML_MAX_SRC];
+    char types[16 * GGML_MAX_SRC];
+    char buffs[64 * GGML_MAX_SRC];
+    char names[64 * GGML_MAX_SRC];
+
+    hex_format_op_dims(dims, op);
+    hex_format_op_strides(strides, op);
+    hex_format_op_types(types, op);
+    hex_format_op_buffs(buffs, op);
+    hex_format_op_names(names, op);
+
+    HEX_VERBOSE("ggml-hex: %s %s: %s : %s : %s : %s : %s: flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op),
+                names, dims, types, strides, buffs, req_flags);
+}
+
 void ggml_hexagon_session::enqueue(struct htp_general_req &req, struct dspqueue_buffer *bufs, uint32_t n_bufs, bool sync) {
     // Bump pending flag (cleared in the session::flush once we get the response)
     this->op_pending++;  // atomic inc
@@ -1912,6 +1929,15 @@ static bool hex_supported_dims(const struct ggml_tensor * x, const struct ggml_t
     return true;
 }
 
+template <typename... _TTensor>
+static inline bool hex_supported_buffer(const struct ggml_hexagon_session * sess, _TTensor... tensors) {
+    return ([&]() -> bool {
+        return !tensors || !tensors->buffer ||
+               (ggml_backend_buffer_is_hexagon(tensors->buffer) &&
+                ggml_backend_hexagon_buffer_get_sess(tensors->buffer) == sess);
+    }() && ...);
+}
+
 static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * sess, const struct ggml_tensor * dst) {
     const struct ggml_tensor * src0 = dst->src[0];
     const struct ggml_tensor * src1 = dst->src[1];
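
For readers less familiar with C++17 fold expressions: the `(... && ...)` in `hex_supported_buffer` above expands the immediately-invoked lambda once per tensor in the parameter pack and short-circuits on the first tensor that fails. A minimal, self-contained sketch of the same pattern, using hypothetical stand-in types rather than the real ggml structs:

```cpp
#include <cstdio>

// Hypothetical stand-ins for the ggml types, for illustration only.
struct buffer { int sess_id; };
struct tensor { buffer * buf; };

// Same shape as hex_supported_buffer: a null tensor or a tensor with no
// buffer passes; a mapped buffer must belong to the expected session.
template <typename... Ts>
static bool all_on_session(int sess_id, Ts... tensors) {
    return ([&]() -> bool {
        return !tensors || !tensors->buf || tensors->buf->sess_id == sess_id;
    }() && ...);  // unary right fold over &&; short-circuits on the first false
}

int main() {
    buffer b0{1}, b1{2};
    tensor t0{&b0}, t1{&b1}, t2{nullptr};

    printf("%d\n", all_on_session(1, &t0, &t2));       // 1: unmapped tensor is fine
    printf("%d\n", all_on_session(1, &t0, &t1, &t2));  // 0: t1 lives on session 2
}
```

The lambda wrapper is what lets the multi-clause condition participate in the fold; it compiles away, so the expansion is equivalent to writing the per-tensor checks out by hand as in the removed code below.
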
@@ -1959,16 +1985,7 @@ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * s
     }
 
     // src0 & src1 & dst must be mapped to the same session
-    if (src0->buffer &&
-        (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
-        return false;
-    }
-    if (src1->buffer &&
-        (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
-        return false;
-    }
-    if (dst->buffer &&
-        (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
+    if (!hex_supported_buffer(sess, src0, src1, dst)) {
         return false;
     }
 
@@ -2016,20 +2033,7 @@ static bool ggml_hexagon_supported_mul_mat_id(const struct ggml_hexagon_session
 
     // src0 (weights) must be repacked and mapped to the same session
     // src1 & src2 & dst must be mapped to the same session
-    if (src0->buffer &&
-        (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
-        return false;
-    }
-    if (src1->buffer &&
-        (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
-        return false;
-    }
-    if (src2->buffer &&
-        (!ggml_backend_buffer_is_hexagon(src2->buffer) || ggml_backend_hexagon_buffer_get_sess(src2->buffer) != sess)) {
-        return false;
-    }
-    if (dst->buffer &&
-        (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
+    if (!hex_supported_buffer(sess, src0, src1, src2, dst)) {
         return false;
     }
 
@@ -2063,16 +2067,7 @@ static bool ggml_hexagon_supported_binary(const struct ggml_hexagon_session * se
     }
 
     // src0, src1 & dst must be mapped to the same session
-    if (src0->buffer &&
-        (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
-        return false;
-    }
-    if (src1->buffer &&
-        (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
-        return false;
-    }
-    if (dst->buffer &&
-        (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
+    if (!hex_supported_buffer(sess, src0, src1, dst)) {
         return false;
     }
 
@@ -2104,20 +2099,7 @@ static bool ggml_hexagon_supported_add_id(const struct ggml_hexagon_session * se
     }
 
     // src0, src1 & dst must be mapped to the same session
-    if (src0->buffer &&
-        (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
-        return false;
-    }
-    if (src1->buffer &&
-        (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
-        return false;
-    }
-    if (src2->buffer &&
-        (!ggml_backend_buffer_is_hexagon(src2->buffer) || ggml_backend_hexagon_buffer_get_sess(src2->buffer) != sess)) {
-        return false;
-    }
-    if (dst->buffer &&
-        (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
+    if (!hex_supported_buffer(sess, src0, src1, src2, dst)) {
         return false;
     }
 
@@ -2144,12 +2126,7 @@ static bool ggml_hexagon_supported_unary(const struct ggml_hexagon_session * ses
     }
 
     // src0 & dst must be mapped to the same session
-    if (src0->buffer &&
-        (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
-        return false;
-    }
-    if (dst->buffer &&
-        (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
+    if (!hex_supported_buffer(sess, src0, dst)) {
         return false;
     }
 
@@ -2186,16 +2163,7 @@ static bool ggml_hexagon_supported_activations(const struct ggml_hexagon_session
     }
 
     // src0, src1 & dst must be mapped to the same session
-    if (src0->buffer &&
-        (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
-        return false;
-    }
-    if (src1 && src1->buffer &&
-        (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
-        return false;
-    }
-    if (dst->buffer &&
-        (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
+    if (!hex_supported_buffer(sess, src0, src1, dst)) {
         return false;
     }
 
@@ -2248,16 +2216,7 @@ static bool ggml_hexagon_supported_softmax(const struct ggml_hexagon_session * s
     }
 
     // src0, src1 & dst must be mapped to the same session
-    if (src0->buffer &&
-        (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
-        return false;
-    }
-    if (src1 && src1->buffer &&
-        (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
-        return false;
-    }
-    if (dst->buffer &&
-        (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
+    if (!hex_supported_buffer(sess, src0, src1, dst)) {
         return false;
     }
 
@@ -2312,20 +2271,7 @@ static bool ggml_hexagon_supported_rope(const struct ggml_hexagon_session * sess
     }
 
     // src0, src1, src2 & dst must be mapped to the same session
-    if (src0->buffer &&
-        (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
-        return false;
-    }
-    if (src1->buffer &&
-        (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
-        return false;
-    }
-    if (src2 && src2->buffer &&
-        (!ggml_backend_buffer_is_hexagon(src2->buffer) || ggml_backend_hexagon_buffer_get_sess(src2->buffer) != sess)) {
-        return false;
-    }
-    if (dst->buffer &&
-        (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
+    if (!hex_supported_buffer(sess, src0, src1, src2, dst)) {
         return false;
     }
 
@@ -2346,6 +2292,26 @@ static void init_htp_tensor(htp_tensor * h, const ggml_tensor * t) {
     h->nb[3] = t->nb[3];
 }
 
+static size_t dspqueue_buffers_init(dspqueue_buffer * buf, const ggml_tensor * t, bool flush_host, bool flush_htp) {
+    if (!t) {
+        return 0;
+    }
+
+    memset(buf, 0, sizeof(*buf));
+    auto tensor_buf = static_cast<ggml_backend_hexagon_buffer_context *>(t->buffer->context);
+    buf->fd      = tensor_buf->fd;
+    buf->ptr     = t->data;
+    buf->offset  = (uint8_t *) t->data - tensor_buf->base;
+    buf->size    = ggml_nbytes(t);
+    buf->flags   = (flush_host ? DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER : 0);        // Flush CPU
+    buf->flags |= (flush_htp ? DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT : 0);  // Invalidate DSP
+    return 1;
+}
+
+static ggml_hexagon_session * get_session_from_tensor(const ggml_tensor * t) {
+    return static_cast<ggml_backend_hexagon_buffer_context *>(t->buffer->context)->sess;
+}
+
 static void hex_dump_dspbuf(const struct ggml_tensor * t, const dspqueue_buffer * d) {
     auto buf  = static_cast<ggml_backend_hexagon_buffer_context *>(t->buffer->context);
     auto sess = buf->sess;
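
The two booleans on `dspqueue_buffers_init` encode the three cache-maintenance patterns used throughout this file: `(false, false)` for static weights, `(true, true)` for buffers the CPU writes and the DSP reads, and `(true, false)` for outputs that only need dirty CPU lines flushed before the DSP writes. A small sketch of that mapping, with assumed bit values (the real `DSPQUEUE_BUFFER_FLAG_*` constants come from the dspqueue headers and may differ):

```cpp
#include <cstdint>
#include <cstdio>

// Assumed bit values for illustration only.
enum : uint32_t {
    FLAG_FLUSH_SENDER         = 1u << 0,  // flush CPU caches before send
    FLAG_INVALIDATE_RECIPIENT = 1u << 1,  // invalidate DSP caches on receive
};

// Mirrors the flag logic inside dspqueue_buffers_init above.
static uint32_t buf_flags(bool flush_host, bool flush_htp) {
    uint32_t f = flush_host ? FLAG_FLUSH_SENDER : 0;
    f |= flush_htp ? FLAG_INVALIDATE_RECIPIENT : 0;
    return f;
}

int main() {
    printf("weights 0x%x\n", buf_flags(false, false));  // static, no maintenance
    printf("inputs  0x%x\n", buf_flags(true, true));    // CPU writes, DSP reads
    printf("outputs 0x%x\n", buf_flags(true, false));   // flush dirty CPU lines only
}
```
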
@@ -2360,10 +2326,6 @@ static void ggml_hexagon_mul_mat(const struct ggml_tensor * op, uint32_t flags)
     const struct ggml_tensor * src1 = op->src[1];
     const struct ggml_tensor * dst  = op;
 
-    auto src0_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src0->buffer->context);
-    auto src1_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src1->buffer->context);
-    auto dst_buf  = static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context);
-
     uint64_t t1, t2;
     t1 = ggml_time_us();
 
@@ -2385,55 +2347,27 @@ static void ggml_hexagon_mul_mat(const struct ggml_tensor * op, uint32_t flags)
     }
 
     dspqueue_buffer bufs[3];
-    memset(bufs, 0, sizeof(bufs));
 
     // First buffer Weights.
     // The content is static, there is no need to do any cache management
-    bufs[0].fd     = src0_buf->fd;
-    bufs[0].ptr    = src0->data;
-    bufs[0].offset = (uint8_t *) src0->data - src0_buf->base;
-    bufs[0].size   = ggml_nbytes(src0);
-    bufs[0].flags  = 0;
+    dspqueue_buffers_init(bufs, src0, false, false);
 
     // Second buffer Input Activations. This is a buffer that the CPU
     // writes and the DSP reads, so we'll need to flush CPU caches and
     // invalidate DSP ones. On platforms with I/O coherency support the
     // framework will automatically skip cache operations where possible.
-    bufs[1].fd     = src1_buf->fd;
-    bufs[1].ptr    = src1->data;
-    bufs[1].offset = (uint8_t *) src1->data - src1_buf->base;
-    bufs[1].size   = ggml_nbytes(src1);
-    bufs[1].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush CPU
-                     DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate DSP
+    dspqueue_buffers_init(&bufs[1], src1, true, true);
 
     // Third buffer Output Activations. We'll handle DSP
     // cache maintenance in the response message but need to flush
     // CPU caches to ensure any previously written dirty lines are
     // written out before writes from the DSP start.
-    bufs[2].fd     = dst_buf->fd;
-    bufs[2].ptr    = dst->data;
-    bufs[2].offset = (uint8_t *) dst->data - dst_buf->base;
-    bufs[2].size   = ggml_nbytes(dst);
-    bufs[2].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
+    dspqueue_buffers_init(&bufs[2], dst, true, false);
 
-    // Primary DSP session from the src0 (normally weight) tensor
-    auto sess = src0_buf->sess;
+    auto * sess = get_session_from_tensor(src0);
 
     if (opt_verbose) {
-        char dims[64 * GGML_MAX_SRC];
-        char strides[64 * GGML_MAX_SRC];
-        char types[16 * GGML_MAX_SRC];
-        char buffs[64 * GGML_MAX_SRC];
-        char names[64 * GGML_MAX_SRC];
-
-        hex_format_op_dims(dims, op);
-        hex_format_op_strides(strides, op);
-        hex_format_op_types(types, op);
-        hex_format_op_buffs(buffs, op);
-        hex_format_op_names(names, op);
-
-        HEX_VERBOSE("ggml-hex: %s %s: %s : %s : %s : %s : %s: flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op),
-                    names, dims, types, strides, buffs, req.flags);
+        hex_print_op_info(op, sess, req.flags);
         if (opt_verbose > 1) {
             hex_dump_dspbuf(src0, &bufs[0]);
             hex_dump_dspbuf(src1, &bufs[1]);
@@ -2463,11 +2397,6 @@ static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flag
     const struct ggml_tensor * src2 = op->src[2];
     const struct ggml_tensor * dst  = op;
 
-    auto src0_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src0->buffer->context);
-    auto src1_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src1->buffer->context);
-    auto src2_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src2->buffer->context);
-    auto dst_buf  = static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context);
-
     uint64_t t1, t2;
     t1 = ggml_time_us();
 
@@ -2490,66 +2419,32 @@ static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flag
     }
 
     dspqueue_buffer bufs[4];
-    memset(bufs, 0, sizeof(bufs));
-
     // First buffer Weights.
     // The content is static, there is no need to do any cache management
-    bufs[0].fd     = src0_buf->fd;
-    bufs[0].ptr    = src0->data;
-    bufs[0].offset = (uint8_t *) src0->data - src0_buf->base;
-    bufs[0].size   = ggml_nbytes(src0);
-    bufs[0].flags  = 0;
+    dspqueue_buffers_init(bufs, src0, false, false);
 
     // Second buffer Input Activations. This is a buffer that the CPU
     // writes and the DSP reads, so we'll need to flush CPU caches and
     // invalidate DSP ones. On platforms with I/O coherency support the
     // framework will automatically skip cache operations where possible.
-    bufs[1].fd     = src1_buf->fd;
-    bufs[1].ptr    = src1->data;
-    bufs[1].offset = (uint8_t *) src1->data - src1_buf->base;
-    bufs[1].size   = ggml_nbytes(src1);
-    bufs[1].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush CPU
-                     DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate DSP
+    dspqueue_buffers_init(&bufs[1], src1, true, true);
 
     // Third buffer expert IDs. This is a buffer that the CPU
     // writes and the DSP reads, so we'll need to flush CPU caches and
     // invalidate DSP ones. On platforms with I/O coherency support the
     // framework will automatically skip cache operations where possible.
-    bufs[2].fd     = src2_buf->fd;
-    bufs[2].ptr    = src2->data;
-    bufs[2].offset = (uint8_t *) src2->data - src2_buf->base;
-    bufs[2].size   = ggml_nbytes(src2);
-    bufs[2].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush CPU
-                     DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate DSP
+    dspqueue_buffers_init(&bufs[2], src2, true, true);
 
     // Fourth buffer Output Activations. We'll handle DSP
     // cache maintenance in the response message but need to flush
     // CPU caches to ensure any previously written dirty lines are
     // written out before writes from the DSP start.
-    bufs[3].fd     = dst_buf->fd;
-    bufs[3].ptr    = dst->data;
-    bufs[3].offset = (uint8_t *) dst->data - dst_buf->base;
-    bufs[3].size   = ggml_nbytes(dst);
-    bufs[3].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
+    dspqueue_buffers_init(&bufs[3], dst, true, false);
 
-    // Primary DSP session from the src0 (normally weight) tensor
-    auto sess = src0_buf->sess;
+    auto * sess = get_session_from_tensor(src0);
 
     if (opt_verbose) {
-        char dims[64 * GGML_MAX_SRC];
-        char strides[64 * GGML_MAX_SRC];
-        char types[16 * GGML_MAX_SRC];
-        char buffs[64 * GGML_MAX_SRC];
-        char names[64 * GGML_MAX_SRC];
-
-        hex_format_op_dims(dims, op);
-        hex_format_op_types(types, op);
-        hex_format_op_buffs(buffs, op);
-        hex_format_op_names(names, op);
-
-        HEX_VERBOSE("ggml-hex: %s %s: %s : %s : %s : %s : %s: flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op),
-                    names, dims, types, strides, buffs, req.flags);
-
+        hex_print_op_info(op, sess, req.flags);
         if (opt_verbose > 1) {
             hex_dump_dspbuf(src0, &bufs[0]);
             hex_dump_dspbuf(src1, &bufs[1]);
@@ -2581,10 +2476,6 @@ static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) {
     const struct ggml_tensor * src1 = node->src[1];
     const struct ggml_tensor * dst  = node;
 
-    auto src0_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src0->buffer->context);
-    auto src1_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src1->buffer->context);
-    auto dst_buf  = static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context);
-
     uint64_t t1 = 0;
     uint64_t t2 = 0;
 
@@ -2621,60 +2512,30 @@ static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) {
     init_htp_tensor(&req.dst, dst);
 
     dspqueue_buffer bufs[3];
-    memset(bufs, 0, sizeof(bufs));
-
     // First buffer = First Operand of Binary op
     // This is a buffer that the CPU writes and the DSP reads, so we'll
     // need to flush CPU caches and invalidate DSP ones. On platforms
     // with I/O coherency support the framework will automatically skip
     // cache operations where possible.
-    bufs[0].fd     = src0_buf->fd;
-    bufs[0].ptr    = src0->data;
-    bufs[0].offset = (uint8_t *) src0->data - src0_buf->base;
-    bufs[0].size   = ggml_nbytes(src0);
-    bufs[0].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush CPU
-                     DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate DSP;
+    dspqueue_buffers_init(bufs, src0, true, true);
 
     // Second buffer = Second Operand of Binary op
     // This is a buffer that the CPU writes and the DSP reads, so we'll
     // need to flush CPU caches and invalidate DSP ones. On platforms
     // with I/O coherency support the framework will automatically skip
     // cache operations where possible.
-    bufs[1].fd     = src1_buf->fd;
-    bufs[1].ptr    = src1->data;
-    bufs[1].offset = (uint8_t *) src1->data - src1_buf->base;
-    bufs[1].size   = ggml_nbytes(src1);
-    bufs[1].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush CPU
-                     DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate DSP
+    dspqueue_buffers_init(&bufs[1], src1, true, true);
 
     // Third buffer = Output Activations. We'll handle DSP
     // cache maintenance in the response message but need to flush
     // CPU caches to ensure any previously written dirty lines are
     // written out before writes from the DSP start.
-    bufs[2].fd     = dst_buf->fd;
-    bufs[2].ptr    = dst->data;
-    bufs[2].offset = (uint8_t *) dst->data - dst_buf->base;
-    bufs[2].size   = ggml_nbytes(dst);
-    bufs[2].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
+    dspqueue_buffers_init(&bufs[2], dst, true, false);
 
-    // Primary DSP session from the src0 tensor
-    ggml_hexagon_session * sess = src0_buf->sess;
+    auto * sess = get_session_from_tensor(src0);
 
     if (opt_verbose) {
-        char dims[64 * GGML_MAX_SRC];
-        char strides[16 * GGML_MAX_SRC];
-        char types[16 * GGML_MAX_SRC];
-        char buffs[64 * GGML_MAX_SRC];
-        char names[64 * GGML_MAX_SRC];
-
-        hex_format_op_dims(dims, op);
-        hex_format_op_strides(strides, op);
-        hex_format_op_types(types, op);
-        hex_format_op_buffs(buffs, op);
-        hex_format_op_names(names, op);
-
-        HEX_VERBOSE("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(),
-                    ggml_op_name(node->op), names, dims, types, strides, buffs, req.flags);
+        hex_print_op_info(op, sess, req.flags);
         if (opt_verbose > 1) {
             hex_dump_dspbuf(src0, &bufs[0]);
             hex_dump_dspbuf(src1, &bufs[1]);
@@ -2705,11 +2566,6 @@ static void ggml_hexagon_add_id(const struct ggml_tensor * op, uint32_t flags) {
     const struct ggml_tensor * src2 = node->src[2];
     const struct ggml_tensor * dst  = node;
 
-    auto src0_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src0->buffer->context);
-    auto src1_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src1->buffer->context);
-    auto src2_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src2->buffer->context);
-    auto dst_buf  = static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context);
-
     uint64_t t1 = 0;
     uint64_t t2 = 0;
 
@@ -2741,58 +2597,19 @@ static void ggml_hexagon_add_id(const struct ggml_tensor * op, uint32_t flags) {
     init_htp_tensor(&req.dst, dst);
 
     dspqueue_buffer bufs[4];
-    memset(bufs, 0, sizeof(bufs));
-
     // First buffer = input activations
-    bufs[0].fd     = src0_buf->fd;
-    bufs[0].ptr    = src0->data;
-    bufs[0].offset = (uint8_t *) src0->data - src0_buf->base;
-    bufs[0].size   = ggml_nbytes(src0);
-    bufs[0].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush CPU
-                     DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate DSP;
-
+    dspqueue_buffers_init(bufs, src0, true, true);
     // Second buffer = experts bias
-    bufs[1].fd     = src1_buf->fd;
-    bufs[1].ptr    = src1->data;
-    bufs[1].offset = (uint8_t *) src1->data - src1_buf->base;
-    bufs[1].size   = ggml_nbytes(src1);
-    bufs[1].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush CPU
-                     DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate DSP
-
+    dspqueue_buffers_init(&bufs[1], src1, true, true);
     // Third buffer = activated experts
-    bufs[2].fd     = src2_buf->fd;
-    bufs[2].ptr    = src2->data;
-    bufs[2].offset = (uint8_t *) src2->data - src2_buf->base;
-    bufs[2].size   = ggml_nbytes(src2);
-    bufs[2].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush CPU
-                     DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate DSP
-
+    dspqueue_buffers_init(&bufs[2], src2, true, true);
     // Fourth buffer = output activations
-    bufs[3].fd     = dst_buf->fd;
-    bufs[3].ptr    = dst->data;
-    bufs[3].offset = (uint8_t *) dst->data - dst_buf->base;
-    bufs[3].size   = ggml_nbytes(dst);
-    bufs[3].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
+    dspqueue_buffers_init(&bufs[3], dst, true, false);
 
-    // Primary DSP session from the src0 tensor
-    ggml_hexagon_session * sess = src0_buf->sess;
+    auto * sess = get_session_from_tensor(src0);
 
     if (opt_verbose) {
-        char dims[64 * GGML_MAX_SRC];
-        char strides[16 * GGML_MAX_SRC];
-        char types[16 * GGML_MAX_SRC];
-        char buffs[64 * GGML_MAX_SRC];
-        char names[64 * GGML_MAX_SRC];
-
-        hex_format_op_dims(dims, op);
-        hex_format_op_strides(strides, op);
-        hex_format_op_types(types, op);
-        hex_format_op_buffs(buffs, op);
-        hex_format_op_names(names, op);
-
-        HEX_VERBOSE("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(),
-                    ggml_op_name(node->op), names, dims, types, strides, buffs, req.flags);
-
+        hex_print_op_info(op, sess, req.flags);
         if (opt_verbose > 1) {
             hex_dump_dspbuf(src0, &bufs[0]);
             hex_dump_dspbuf(src1, &bufs[1]);
@@ -2886,71 +2703,33 @@ static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) {
     }
 
     dspqueue_buffer bufs[3];
-    int             n_bufs = 0;
-
-    memset(bufs, 0, sizeof(bufs));
 
     // First buffer = Only Operand of Unary op
     // This is a buffer that the CPU writes and the DSP reads, so we'll
     // need to flush CPU caches and invalidate DSP ones. On platforms
     // with I/O coherency support the framework will automatically skip
     // cache operations where possible.
-    auto src0_buf       = static_cast<ggml_backend_hexagon_buffer_context *>(src0->buffer->context);
-    bufs[n_bufs].fd     = src0_buf->fd;
-    bufs[n_bufs].ptr    = src0->data;
-    bufs[n_bufs].offset = (uint8_t *) src0->data - src0_buf->base;
-    bufs[n_bufs].size   = ggml_nbytes(src0);
-    bufs[n_bufs].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush CPU
-                          DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate DSP;
-    ++n_bufs;
+    size_t n_bufs = dspqueue_buffers_init(bufs, src0, true, true);
 
-    if (src1) {
-        // Second buffer = Second Operand of Binary op
-        // This is a buffer that the CPU writes and the DSP reads, so we'll
-        // need to flush CPU caches and invalidate DSP ones. On platforms
-        // with I/O coherency support the framework will automatically skip
-        // cache operations where possible.
-        auto src1_buf       = static_cast<ggml_backend_hexagon_buffer_context *>(src1->buffer->context);
-        bufs[n_bufs].fd     = src1_buf->fd;
-        bufs[n_bufs].ptr    = src1->data;
-        bufs[n_bufs].offset = (uint8_t *) src1->data - src1_buf->base;
-        bufs[n_bufs].size   = ggml_nbytes(src1);
-        bufs[n_bufs].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush CPU
-                              DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate DSP
-        ++n_bufs;
-    }
+    // Second buffer (nullable) = Second Operand of Binary op
+    // This is a buffer that the CPU writes and the DSP reads, so we'll
+    // need to flush CPU caches and invalidate DSP ones. On platforms
+    // with I/O coherency support the framework will automatically skip
+    // cache operations where possible.
+    n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src1, true, true);
 
     // Second or third buffer = Output Activations. We'll handle DSP
     // cache maintenance in the response message but need to flush
     // CPU caches to ensure any previously written dirty lines are
     // written out before writes from the DSP start.
-    auto dst_buf        = static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context);
-    bufs[n_bufs].fd     = dst_buf->fd;
-    bufs[n_bufs].ptr    = dst->data;
-    bufs[n_bufs].offset = (uint8_t *) dst->data - dst_buf->base;
-    bufs[n_bufs].size   = ggml_nbytes(dst);
-    bufs[n_bufs].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
-    ++n_bufs;
+    n_bufs += dspqueue_buffers_init(&bufs[n_bufs], dst, true, false);
 
     // Primary DSP session from the src0 tensor
-    ggml_hexagon_session * sess = src0_buf->sess;
+    auto * sess = get_session_from_tensor(src0);
 
     if (opt_verbose) {
-        char dims[64 * GGML_MAX_SRC];
-        char strides[64 * GGML_MAX_SRC];
-        char types[16 * GGML_MAX_SRC];
-        char buffs[64 * GGML_MAX_SRC];
-        char names[64 * GGML_MAX_SRC];
-
-        hex_format_op_dims(dims, op);
-        hex_format_op_strides(strides, op);
-        hex_format_op_types(types, op);
-        hex_format_op_buffs(buffs, op);
-        hex_format_op_names(names, op);
-
-        HEX_VERBOSE("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op),
-                    names, dims, types, strides, buffs, req.flags);
+        hex_print_op_info(op, sess, req.flags);
         if (opt_verbose > 1) {
             hex_dump_dspbuf(src0, &bufs[0]);
             if (src1) {
@@ -3023,85 +2802,40 @@ static void ggml_hexagon_rope(const struct ggml_tensor * op, uint32_t flags) {
     }
 
     dspqueue_buffer bufs[4];
-    int             n_bufs = 0;
-
-    memset(bufs, 0, sizeof(bufs));
 
     // First buffer
     // This is a buffer that the CPU writes and the DSP reads, so we'll
     // need to flush CPU caches and invalidate DSP ones. On platforms
     // with I/O coherency support the framework will automatically skip
     // cache operations where possible.
-    auto src0_buf       = static_cast<ggml_backend_hexagon_buffer_context *>(src0->buffer->context);
-    bufs[n_bufs].fd     = src0_buf->fd;
-    bufs[n_bufs].ptr    = src0->data;
-    bufs[n_bufs].offset = (uint8_t *) src0->data - src0_buf->base;
-    bufs[n_bufs].size   = ggml_nbytes(src0);
-    bufs[n_bufs].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush CPU
-                          DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate DSP;
-    ++n_bufs;
+    size_t n_bufs = dspqueue_buffers_init(bufs, src0, true, true);
 
     // Second buffer
     // This is a buffer that the CPU writes and the DSP reads, so we'll
     // need to flush CPU caches and invalidate DSP ones. On platforms
     // with I/O coherency support the framework will automatically skip
     // cache operations where possible.
-    auto src1_buf       = static_cast<ggml_backend_hexagon_buffer_context *>(src1->buffer->context);
-    bufs[n_bufs].fd     = src1_buf->fd;
-    bufs[n_bufs].ptr    = src1->data;
-    bufs[n_bufs].offset = (uint8_t *) src1->data - src1_buf->base;
-    bufs[n_bufs].size   = ggml_nbytes(src1);
-    bufs[n_bufs].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush CPU
-                          DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate DSP
-    ++n_bufs;
+    n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src1, true, true);
 
-    if (src2) {
-        // Third buffer
-        // This is a buffer that the CPU writes and the DSP reads, so we'll
-        // need to flush CPU caches and invalidate DSP ones. On platforms
-        // with I/O coherency support the framework will automatically skip
-        // cache operations where possible.
-        auto src2_buf       = static_cast<ggml_backend_hexagon_buffer_context *>(src2->buffer->context);
-        bufs[n_bufs].fd     = src2_buf->fd;
-        bufs[n_bufs].ptr    = src2->data;
-        bufs[n_bufs].offset = (uint8_t *) src2->data - src2_buf->base;
-        bufs[n_bufs].size   = ggml_nbytes(src2);
-        bufs[n_bufs].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush CPU
-                              DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate DSP
-        ++n_bufs;
-    }
+    // Third buffer (nullable)
+    // This is a buffer that the CPU writes and the DSP reads, so we'll
+    // need to flush CPU caches and invalidate DSP ones. On platforms
+    // with I/O coherency support the framework will automatically skip
+    // cache operations where possible.
+    n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src2, true, true);
 
     // Final buffer = Output Activations. We'll handle DSP
     // cache maintenance in the response message but need to flush
     // CPU caches to ensure any previously written dirty lines are
     // written out before writes from the DSP start.
-    auto dst_buf        = static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context);
-    bufs[n_bufs].fd     = dst_buf->fd;
-    bufs[n_bufs].ptr    = dst->data;
-    bufs[n_bufs].offset = (uint8_t *) dst->data - dst_buf->base;
-    bufs[n_bufs].size   = ggml_nbytes(dst);
-    bufs[n_bufs].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
-    ++n_bufs;
+    n_bufs += dspqueue_buffers_init(&bufs[n_bufs], dst, true, false);
 
     // Primary DSP session from the src0 tensor
-    ggml_hexagon_session * sess = src0_buf->sess;
+    auto * sess = get_session_from_tensor(src0);
 
     if (opt_verbose) {
-        char dims[64 * GGML_MAX_SRC];
-        char strides[64 * GGML_MAX_SRC];
-        char types[16 * GGML_MAX_SRC];
-        char buffs[64 * GGML_MAX_SRC];
-        char names[64 * GGML_MAX_SRC];
-
-        hex_format_op_dims(dims, op);
-        hex_format_op_strides(strides, op);
-        hex_format_op_types(types, op);
-        hex_format_op_buffs(buffs, op);
-        hex_format_op_names(names, op);
-
-        HEX_VERBOSE("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op),
-                    names, dims, types, strides, buffs, req.flags);
+        hex_print_op_info(op, sess, req.flags);
         if (opt_verbose > 1) {
             hex_dump_dspbuf(src0, &bufs[0]);
             if (src1) {
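
One detail worth calling out in the unary and rope paths above: because `dspqueue_buffers_init` returns 0 for a null tensor, the optional `src1`/`src2` slot is packed (or skipped) without an explicit branch, and `n_bufs` stays correct either way. A sketch of the counting pattern, with a hypothetical `pack_one` standing in for the real helper:

```cpp
#include <cstddef>
#include <cstdio>

struct slot { int id; };

// Hypothetical stand-in for dspqueue_buffers_init: fills one slot and
// returns 1, or returns 0 (touching nothing) when the tensor is absent.
static size_t pack_one(slot * out, const int * maybe_tensor) {
    if (!maybe_tensor) {
        return 0;
    }
    out->id = *maybe_tensor;
    return 1;
}

int main() {
    int src0 = 10, dst = 30;
    const int * src1 = nullptr;  // the optional operand is missing here

    slot bufs[3];
    size_t n_bufs = pack_one(bufs, &src0);
    n_bufs += pack_one(&bufs[n_bufs], src1);  // no-op: adds 0, writes nothing
    n_bufs += pack_one(&bufs[n_bufs], &dst);  // lands in bufs[1], not bufs[2]

    printf("packed %zu buffers\n", n_bufs);   // 2
}
```
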

+ 14 - 13
ggml/src/ggml-hexagon/htp/hvx-exp.c

@@ -16,13 +16,8 @@
 #include "hvx-utils.h"
 #include "ops-utils.h"
 
-static inline HVX_Vector hvx_vec_exp_fp32_guard(HVX_Vector in_vec) {
-    static const float kInf    = INFINITY;
-    static const float kMaxExp = 88.02f;  // log(INF)
-
-    const HVX_Vector     max_exp = hvx_vec_splat_fp32(kMaxExp);
-    const HVX_Vector     inf     = hvx_vec_splat_fp32(kInf);
-    const HVX_VectorPred pred0   = Q6_Q_vcmp_gt_VsfVsf(in_vec, max_exp);
+static inline HVX_Vector hvx_vec_exp_fp32_guard(HVX_Vector in_vec, HVX_Vector max_exp, HVX_Vector inf) {
+    const HVX_VectorPred pred0 = Q6_Q_vcmp_gt_VsfVsf(in_vec, max_exp);
 
     HVX_Vector out = hvx_vec_exp_fp32(in_vec);
 
@@ -47,6 +42,12 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int
 
     HVX_Vector vec_out = Q6_V_vzero();
 
+    static const float kInf    = INFINITY;
+    static const float kMaxExp = 88.02f;  // log(INF)
+
+    const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp);
+    const HVX_Vector inf     = hvx_vec_splat_fp32(kInf);
+
     if (0 == unaligned_loop) {
         HVX_Vector * p_vec_in1 = (HVX_Vector *) src;
         HVX_Vector * p_vec_out = (HVX_Vector *) dst;
@@ -55,9 +56,9 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int
         for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
             if (true == negate) {
                 HVX_Vector neg_vec_in = hvx_vec_neg_fp32(*p_vec_in1++);
-                *p_vec_out++          = hvx_vec_exp_fp32_guard(neg_vec_in);
+                *p_vec_out++          = hvx_vec_exp_fp32_guard(neg_vec_in, max_exp, inf);
             } else {
-                *p_vec_out++ = hvx_vec_exp_fp32_guard(*p_vec_in1++);
+                *p_vec_out++ = hvx_vec_exp_fp32_guard(*p_vec_in1++, max_exp, inf);
             }
         }
     } else {
@@ -67,9 +68,9 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int
 
             if (true == negate) {
                 HVX_Vector neg_vec_in                    = hvx_vec_neg_fp32(in);
-                *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard(neg_vec_in);
+                *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard(neg_vec_in, max_exp, inf);
             } else {
-                *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard(in);
+                *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard(in, max_exp, inf);
             }
         }
     }
@@ -83,9 +84,9 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int
         if (true == negate) {
             HVX_Vector neg_vec_in = hvx_vec_neg_fp32(in);
 
-            vec_out = hvx_vec_exp_fp32_guard(neg_vec_in);
+            vec_out = hvx_vec_exp_fp32_guard(neg_vec_in, max_exp, inf);
         } else {
-            vec_out = hvx_vec_exp_fp32_guard(in);
+            vec_out = hvx_vec_exp_fp32_guard(in, max_exp, inf);
         }
 
         hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, vec_out);
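
Two things happen in this file: the `INFINITY`/`88.02f` splats are hoisted out of the per-vector helper into the caller's setup code, so the loop body no longer re-materializes them on every iteration, and the guard keeps clamping any lane above the overflow threshold to +inf rather than letting the fast approximation wrap. A scalar analogue of the guarded exp, assuming plain standard C++:

```cpp
#include <cmath>
#include <cstdio>

// Scalar analogue of hvx_vec_exp_fp32_guard: inputs past the overflow
// threshold are forced to +inf instead of whatever the fast polynomial
// approximation would otherwise produce.
static float exp_guarded(float x, float max_exp, float inf) {
    float out = std::exp(x);  // stands in for hvx_vec_exp_fp32
    return (x > max_exp) ? inf : out;
}

int main() {
    const float max_exp = 88.02f;  // threshold used in the source above
    const float inf     = INFINITY;

    printf("%f\n", exp_guarded(1.0f, max_exp, inf));    // ~2.718282
    printf("%f\n", exp_guarded(100.0f, max_exp, inf));  // inf
}
```
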

+ 15 - 3
ggml/src/ggml-hexagon/htp/hvx-inverse.c

@@ -16,6 +16,15 @@
 #include "hvx-utils.h"
 #include "ops-utils.h"
 
+static inline HVX_Vector hvx_vec_inverse_fp32_guard(HVX_Vector v_sf, HVX_Vector nan_inf_mask) {
+    HVX_Vector out = hvx_vec_inverse_fp32(v_sf);
+
+    HVX_Vector           masked_out = Q6_V_vand_VV(out, nan_inf_mask);
+    const HVX_VectorPred pred       = Q6_Q_vcmp_eq_VwVw(nan_inf_mask, masked_out);
+
+    return Q6_V_vmux_QVV(pred, Q6_V_vzero(), out);
+}
+
 void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems) {
     int left_over       = num_elems & (VLEN_FP32 - 1);
     int num_elems_whole = num_elems - left_over;
@@ -32,19 +41,22 @@ void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const
         FARF(HIGH, "hvx_inverse_f32: unaligned loop in hvx op, possibly slower execution\n");
     }
 
+    static const uint32_t kNanInfMask  = 0x7f800000;
+    const HVX_Vector      nan_inf_mask = Q6_V_vsplat_R(kNanInfMask);
+
     if (0 == unaligned_loop) {
         HVX_Vector * p_vec_in  = (HVX_Vector *) src;
         HVX_Vector * p_vec_out = (HVX_Vector *) dst;
 
         #pragma unroll(4)
         for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
-            *p_vec_out++ = hvx_vec_inverse_fp32_guard(*p_vec_in++);
+            *p_vec_out++ = hvx_vec_inverse_fp32_guard(*p_vec_in++, nan_inf_mask);
         }
     } else {
         #pragma unroll(4)
         for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
             HVX_Vector in                            = *(HVX_UVector *) (src + i * SIZEOF_FP32);
-            *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_inverse_fp32_guard(in);
+            *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_inverse_fp32_guard(in, nan_inf_mask);
         }
     }
 
@@ -53,7 +65,7 @@ void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const
         float *       dstf = (float *) dst + num_elems_whole;
 
         HVX_Vector in  = *(HVX_UVector *) srcf;
-        HVX_Vector out = hvx_vec_inverse_fp32_guard(in);
+        HVX_Vector out = hvx_vec_inverse_fp32_guard(in, nan_inf_mask);
 
         hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, out);
     }
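
The rewritten guard runs entirely on IEEE-754 bits: a float is NaN or ±inf exactly when all eight exponent bits (mask `0x7f800000`) are set, so the vector code ANDs the result with the mask, compares for equality, and muxes the offending lanes to zero. A scalar sketch of the same test:

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

// Scalar analogue of hvx_vec_inverse_fp32_guard: compute 1/x, then zero
// the result if it came out NaN or infinite (exponent bits all set).
static float inverse_guarded(float x) {
    const uint32_t nan_inf_mask = 0x7f800000u;

    float out = 1.0f / x;  // stands in for hvx_vec_inverse_fp32

    uint32_t bits;
    memcpy(&bits, &out, sizeof(bits));            // bit-exact view of the float
    if ((bits & nan_inf_mask) == nan_inf_mask) {  // exponent saturated: NaN or inf
        return 0.0f;
    }
    return out;
}

int main() {
    printf("%f\n", inverse_guarded(4.0f));  // 0.25
    printf("%f\n", inverse_guarded(0.0f));  // 0.0 instead of inf
}
```

Dropping the old `pred_inf` pre-check means a single mask comparison now covers both overflow to ±inf and NaN in one predicate, which is presumably where the instruction savings come from.
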

+ 16 - 25
ggml/src/ggml-hexagon/htp/hvx-utils.h

@@ -726,24 +726,6 @@ static inline HVX_Vector hvx_vec_inverse_fp32(HVX_Vector v_sf) {
     return Q6_Vsf_equals_Vqf32(r_qf);
 }
 
-static inline HVX_Vector hvx_vec_inverse_fp32_guard(HVX_Vector v_sf) {
-    static const float    kInf     = INFINITY;
-    static const uint32_t kNanMask = 0x7fffffff;
-    static const uint32_t kNanMin  = 0x7f800000;
-
-    const HVX_Vector     inf      = hvx_vec_splat_fp32(kInf);
-    const HVX_VectorPred pred_inf = Q6_Q_vcmp_gt_VsfVsf(inf, v_sf);
-
-    HVX_Vector out = hvx_vec_inverse_fp32(v_sf);
-
-    const HVX_Vector     nan_mask   = Q6_V_vsplat_R(kNanMask);
-    const HVX_Vector     nan_min    = Q6_V_vsplat_R(kNanMin);
-    HVX_Vector           masked_out = Q6_V_vand_VV(out, nan_mask);
-    const HVX_VectorPred pred       = Q6_Q_vcmp_gtand_QVuwVuw(pred_inf, nan_min, masked_out);
-
-    return Q6_V_vmux_QVV(pred, out, Q6_V_vzero());
-}
-
 #define FAST_SIGMOID_LOG2F (0x3fb8aa3b)  // 1.442695022
 #define FAST_SIGMOID_C1    (0x3d009076)  // 0.03138777
 #define FAST_SIGMOID_C2    (0x3e8d74bd)  // 0.276281267
@@ -958,14 +940,16 @@ static inline HVX_Vector hvx_vec_rsqrt_fp32(HVX_Vector in_vec) {
     return Q6_Vsf_equals_Vqf32(temp);
 }
 
-static inline HVX_Vector hvx_vec_fast_sigmoid_fp32_guard(HVX_Vector v) {
-    static const float kMaxExp = -88.02f;  // log(INF)
-
-    const HVX_Vector     max_exp  = Q6_V_vsplat_R(*((uint32_t *) &kMaxExp));
-    const HVX_VectorPred pred_inf = Q6_Q_vcmp_gt_VsfVsf(v, max_exp);
+static inline HVX_Vector hvx_vec_fast_sigmoid_fp32_guard(HVX_Vector v,
+                                                         HVX_Vector one,
+                                                         HVX_Vector max_exp,
+                                                         HVX_Vector min_exp) {
+    const HVX_VectorPred pred_max = Q6_Q_vcmp_gt_VsfVsf(max_exp, v);
+    const HVX_VectorPred pred_min = Q6_Q_vcmp_gt_VsfVsf(v, min_exp);
 
     HVX_Vector out = hvx_vec_fast_sigmoid_fp32(v);
-    return Q6_V_vmux_QVV(pred_inf, out, Q6_V_vzero());
+    out            = Q6_V_vmux_QVV(pred_max, out, one);
+    return Q6_V_vmux_QVV(pred_min, out, Q6_V_vzero());
 }
 
 static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems) {
@@ -977,9 +961,16 @@ static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t *
     const HVX_Vector * restrict v_src = (HVX_Vector *) src;
     HVX_Vector * restrict v_dst       = (HVX_Vector *) dst;
 
+    static const float kMinExp = -87.f;  // 0
+    static const float kMaxExp = 87.f;   // 1
+
+    const HVX_Vector one     = hvx_vec_splat_fp32(1.f);
+    const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp);
+    const HVX_Vector min_exp = hvx_vec_splat_fp32(kMinExp);
+
     #pragma unroll(4)
     for (int i = 0; i < step_of_1; i++) {
-        v_dst[i] = hvx_vec_fast_sigmoid_fp32_guard(v_src[i]);
+        v_dst[i] = hvx_vec_fast_sigmoid_fp32_guard(v_src[i], one, max_exp, min_exp);
     }
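
Compared with the old single-sided guard, which only zeroed lanes below roughly -88, the new version pins both tails: lanes at or above +87 become exactly 1, lanes at or below -87 become exactly 0, and only the middle band goes through the fast polynomial. A scalar analogue:

```cpp
#include <cmath>
#include <cstdio>

// Scalar analogue of hvx_vec_fast_sigmoid_fp32_guard: outside roughly
// [-87, 87] the internal exp over/underflows, so both tails are pinned
// to the exact limits of the sigmoid.
static float sigmoid_guarded(float v, float min_exp, float max_exp) {
    float out = 1.0f / (1.0f + std::exp(-v));  // stands in for the fast HVX version
    if (v >= max_exp) {
        out = 1.0f;  // saturated high tail
    }
    if (v <= min_exp) {
        out = 0.0f;  // saturated low tail
    }
    return out;
}

int main() {
    printf("%f\n", sigmoid_guarded(0.0f, -87.f, 87.f));     // 0.5
    printf("%f\n", sigmoid_guarded(100.0f, -87.f, 87.f));   // 1.0
    printf("%f\n", sigmoid_guarded(-100.0f, -87.f, 87.f));  // 0.0
}
```
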
 }