@@ -361,8 +361,8 @@ struct ggml_tensor * llm_build_qwen3next::delta_net(
     cb(attn, "attn_in", il);

     // We'll be returning the result as a 1D tensor due to the dimensions mismatch of the state and output tensors
-    const int64_t ne[1] = { (S_v * H_v * n_tokens * n_seqs ) + (S_v * S_v * H_v * n_seqs) };
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 1, ne);
+    const int64_t total_dims = (S_v * H_v * n_tokens * n_seqs) + (S_v * S_v * H_v * n_seqs);
+    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, total_dims);

     ggml_set_op_params_i32(result, 0, H_v);
     ggml_set_op_params_i32(result, 1, S_k);
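// Reviewer note, not part of the patch: the packed 1D result above is assumed to carry the
// attention output (S_v * H_v * n_tokens * n_seqs elements) first and the updated state
// (S_v * S_v * H_v * n_seqs elements) second. A hypothetical consumer could split it back
// with byte-offset 1D views, e.g.:

ggml_tensor * out_flat   = ggml_view_1d(ctx, result, S_v * H_v * n_tokens * n_seqs, 0);
ggml_tensor * state_flat = ggml_view_1d(ctx, result, S_v * S_v * H_v * n_seqs,
                                        S_v * H_v * n_tokens * n_seqs * ggml_element_size(result));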
@@ -384,7 +384,6 @@ struct ggml_tensor * llm_build_qwen3next::delta_net(
 }

 // delta_net_recurrent
-// Recurrent version of delta_net for sequence_length = 1
 struct ggml_tensor * llm_build_qwen3next::delta_net_recurrent(
         struct ggml_context * ctx,
         struct ggml_tensor * q,
@@ -467,79 +466,33 @@ struct ggml_tensor * llm_build_qwen3next::delta_net_recurrent(
     state = ggml_cont_4d(ctx, state, S_v, S_v, H_k, n_seqs);
     ggml_tensor * g_tokens_exp = ggml_exp(ctx, g_tokens);

-    ggml_tensor * final_output = nullptr;
-    ggml_tensor * q_t, * k_t, * v_t, * g_t_exp, * beta_t;
-    for (int i = 0; i < n_tokens; i++) { // this part is per token
-        if (n_tokens == 1) { // don't do unnecessary reshapes / views
-            q_t = q_tokens;
-            k_t = k_tokens;
-            v_t = v_tokens;
-            g_t_exp = g_tokens_exp;
-            beta_t = beta_tokens;
-        } else {
-            q_t = ggml_view_4d(ctx, q_tokens, 1, S_k, H_k, n_seqs, q_tokens->nb[1], q_tokens->nb[2], q_tokens->nb[3], i * ggml_element_size(q_tokens));
-            k_t = ggml_view_4d(ctx, k_tokens, 1, S_k, H_k, n_seqs, k_tokens->nb[1], k_tokens->nb[2], k_tokens->nb[3], i * ggml_element_size(k_tokens));
-            v_t = ggml_view_4d(ctx, v_tokens, 1, S_v, H_k, n_seqs, v_tokens->nb[1], v_tokens->nb[2], v_tokens->nb[3], i * ggml_element_size(v_tokens));
-            g_t_exp = ggml_view_4d(ctx, g_tokens_exp, 1, 1, H_k, n_seqs, g_tokens_exp->nb[1], g_tokens_exp->nb[2], g_tokens_exp->nb[3], i * ggml_element_size(g_tokens_exp));
-            beta_t = ggml_view_4d(ctx, beta_tokens, 1, 1, H_k, n_seqs, beta_tokens->nb[1], beta_tokens->nb[2], beta_tokens->nb[3], i * ggml_element_size(beta_tokens));
-        }
-
-        // Apply gate to state: state = state * exp(g)
-        ggml_tensor * gated_state = ggml_mul(ctx, state, g_t_exp);
-        cb(gated_state, "gated_state", il);
-
-        // Compute kv_memory from state and key
-        // kv_mem = (state * k.unsqueeze(-1)).sum(dim=-2)
-
-        // Reshape gated_state from [S_v, S_v*H_v, 1, n_seqs] to [S_v, S_v, H_v, n_seqs]
-        // to make it compatible with k_expanded for element-wise multiplication
-        ggml_tensor * gated_state_reshaped = ggml_reshape_4d(ctx, gated_state, S_v, S_v, H_v, n_seqs);
-        cb(gated_state_reshaped, "gated_state_reshaped", il);
-
-        ggml_tensor * state_k_product = ggml_mul(ctx, gated_state_reshaped, k_t);
-        cb(state_k_product, "state_k_product", il);
-
-        ggml_tensor * kv_memory = ggml_sum_rows(ctx, ggml_cont(ctx, ggml_transpose(ctx, state_k_product)));
-        cb(kv_memory, "kv_memory", il);
-
-        // Compute delta = (v - kv_memory) * beta
-        ggml_tensor * v_diff = ggml_sub(ctx, v_t, kv_memory);
-        ggml_tensor * delta = ggml_mul(ctx, v_diff, beta_t);
-        cb(delta, "delta", il);
-
-        // Update state = state + k * delta
-        // In the reference: last_recurrent_state = last_recurrent_state + k_t.unsqueeze(-1) * delta.unsqueeze(-2)
-        ggml_tensor * delta_t = ggml_transpose(ctx, delta);
-
-        // Will need to broadcast here since GGML doesn't support auto-double-broadcasting on mul
-        ggml_tensor * delta_t_broadcast = ggml_repeat_4d(ctx, delta_t, S_v, S_v, H_v, n_seqs);
-        ggml_tensor * k_t_broadcast = ggml_repeat_4d(ctx, k_t, S_v, S_v, H_v, n_seqs);
-        ggml_tensor * k_delta_product = ggml_mul(ctx, k_t_broadcast, delta_t_broadcast);
-        cb(k_delta_product, "k_delta", il);
+    // Create result tensor with the same dimensions as delta_net
+    const int64_t total_dims = (S_v * H_v * n_tokens * n_seqs) + (S_v * S_v * H_v * n_seqs);
+    ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, total_dims);
-        state = ggml_add(ctx, gated_state_reshaped, k_delta_product);
-        cb(state, "updated_state", il);
+    cb(q_tokens, "q_tokens", il);
+    cb(k_tokens, "k_tokens", il);
+    cb(v_tokens, "v_tokens", il);
+    cb(g_tokens, "g_tokens", il);
+    cb(beta_tokens, "beta_tokens", il);
+    cb(g_tokens_exp, "g_tokens_exp", il);
+    cb(state, "state_pre", il);
-        ggml_tensor * state_q_product = ggml_mul(ctx, state, q_t);
-        cb(state_q_product, "state_q_product", il);
-
-        ggml_tensor * output = ggml_sum_rows(ctx, ggml_cont(ctx, ggml_transpose(ctx, state_q_product)));
-        cb(output, "output", il);
+    // Set operation parameters
+    ggml_set_op_params_i32(result, 0, H_v);
+    ggml_set_op_params_i32(result, 1, S_k);
+    ggml_set_op_params_i32(result, 2, S_v);
+    ggml_set_op_params_i32(result, 3, n_tokens); // Pass original n_tokens
-        if (final_output == nullptr) {
-            final_output = output;
-        } else {
-            final_output = ggml_concat(ctx, final_output, output, 0);
-        }
-    }
+    // Set operation and source tensors
+    result->op = GGML_OP_DELTA_NET_RECURRENT;
+    result->src[0] = q_tokens;
+    result->src[1] = k_tokens;
+    result->src[2] = v_tokens;
+    result->src[3] = g_tokens_exp;
+    result->src[4] = beta_tokens;
+    result->src[5] = state;
-    // Concatenate output and updated_state into a single tensor
-    // First, flatten both tensors to 1D
-    ggml_tensor * output_1d = ggml_cont_1d(ctx, final_output, ggml_nelements(final_output));
-    ggml_tensor * updated_state_1d = ggml_cont_1d(ctx, state, ggml_nelements(state));
-
-    // Concatenate them: [output, updated_state]
-    ggml_tensor * result = ggml_concat(ctx, output_1d, updated_state_1d, 0);
     return result;
 }
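// Reference sketch, not part of the patch: the per-token recurrence that the removed loop
// implemented, and that the new GGML_OP_DELTA_NET_RECURRENT op is presumably expected to
// reproduce, shown for a single head and a single token. S is row-major [S_v][S_k], with i
// indexing the value dim and j the key dim; g and beta are the per-head scalars.

#include <cmath>

static void delta_net_step_ref(int S_v, int S_k, float * S, const float * q, const float * k,
                               const float * v, float g, float beta, float * out) {
    for (int i = 0; i < S_v; ++i) {
        for (int j = 0; j < S_k; ++j) {
            S[i * S_k + j] *= std::exp(g);          // gate the state: S = S * exp(g)
        }
        float kv = 0.0f;
        for (int j = 0; j < S_k; ++j) {
            kv += S[i * S_k + j] * k[j];            // kv_mem = (S * k).sum over the key dim
        }
        const float delta = (v[i] - kv) * beta;     // delta = (v - kv_mem) * beta
        for (int j = 0; j < S_k; ++j) {
            S[i * S_k + j] += k[j] * delta;         // S = S + (row i of the outer product delta x k)
        }
        float o = 0.0f;
        for (int j = 0; j < S_k; ++j) {
            o += S[i * S_k + j] * q[j];             // output = S . q
        }
        out[i] = o;
    }
}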
@@ -604,7 +557,8 @@ ggml_tensor * llm_build_qwen3next::build_qwen3next_linear_attn_layer(llm_graph_i

     GGML_ASSERT(ggml_nelements(beta) + ggml_nelements(alpha) == ggml_nelements(mixed_ba));

-    ggml_tensor * alpha_softplus = softplus(alpha, model.layers[il].ssm_dt);
+    ggml_tensor * alpha_biased = ggml_add(ctx0, alpha, model.layers[il].ssm_dt);
+    ggml_tensor * alpha_softplus = ggml_softplus(ctx0, alpha_biased);
     cb(alpha_softplus, "a_softplus", il);
     ggml_tensor * gate = ggml_mul(ctx0, alpha_softplus, model.layers[il].ssm_a); // -A_log.exp() * softplus
     cb(gate, "gate", il);
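// Scalar form of the gate computed in the hunk above, for reference (not part of the patch;
// assumes ssm_a already stores -exp(A_log), as the inline comment suggests):

#include <cmath>

static float gate_ref(float a, float dt_bias, float neg_exp_A_log) {
    const float sp = std::log1p(std::exp(a + dt_bias)); // softplus(a + dt_bias), what ggml_softplus(alpha_biased) is assumed to compute element-wise
    return neg_exp_A_log * sp;                          // gate = -exp(A_log) * softplus(a + dt_bias)
}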
@@ -870,10 +824,3 @@ ggml_tensor * llm_build_qwen3next::build_layer_ffn(ggml_tensor * cur, const llam
     return cur;
 }

-ggml_tensor * llm_build_qwen3next::softplus(ggml_tensor * alpha, ggml_tensor * dt_bias) {
-    ggml_tensor * alpha_biased = ggml_add(ctx0, alpha, dt_bias); // a + dt_bias
-    ggml_tensor * alpha_exp = ggml_exp(ctx0, alpha_biased); // exp(a + dt_bias)
-    ggml_tensor * one_plus_exp = ggml_scale_bias(ctx0, alpha_exp, 1.0f, 1.0f); // 1 + exp(a + dt_bias)
-    ggml_tensor * alpha_softplus = ggml_log(ctx0, one_plus_exp); // log(1 + exp(...))
-    return alpha_softplus;
-}
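// Side note, not part of the patch: the deleted helper computed softplus(x) = log(1 + exp(x))
// literally via ggml_exp -> ggml_scale_bias -> ggml_log, which overflows to inf for large x.
// A fused softplus op can use the numerically stable form instead (scalar sketch):

#include <cmath>

static float softplus_stable(float x) {
    // mathematically identical to log(1 + exp(x)), but exp() only ever sees a non-positive argument
    return x > 0.0f ? x + std::log1p(std::exp(-x)) : std::log1p(std::exp(x));
}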