
ggml : add ops SOFTPLUS, EXPM1, TRI, SOLVE_TRI, CUMSUM (#17063)

* Add ops needed for new hybrid models: SOFTPLUS, EXPM1, TRI, SOLVE_TRI, CUMSUM

* Update ggml/include/ggml.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update tests/test-backend-ops.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Code review

* Whitespace

* Update tests/test-backend-ops.cpp

Co-authored-by: Diego Devesa <slarengh@gmail.com>

* This is actually sigmoid, duh.

* Add CONST, remove TRI_KEEP, other changes from review

* Update tests/test-backend-ops.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml/src/ggml.c

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml/src/ggml.c

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml/src/ggml-cuda/unary.cu

Co-authored-by: Aman Gupta <amangupta052@gmail.com>

* Remove extra script

* Update ggml/src/ggml.c

Co-authored-by: Diego Devesa <slarengh@gmail.com>

* Update tests/test-backend-ops.cpp

Co-authored-by: Diego Devesa <slarengh@gmail.com>

* moving changes from laptop [no ci]

* pre-rebase

* Update tests/test-backend-ops.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update tests/test-backend-ops.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Refactor tests

* ggml : cleanup

* cont : fix ggml_fill srcs

* tests : add note

* ggml : add ggml_fill_inplace

* ggml : add asserts

* ggml : fix ggml_fill constant cast

* cont : ggml_tri minor

* Use TENSOR_LOCALS

* Fix regression from #14596, regenerate

* Don't make commits at night...

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Diego Devesa <slarengh@gmail.com>
Co-authored-by: Aman Gupta <amangupta052@gmail.com>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
Piotr Wilkin (ilintar) 2 months ago
Parent
Commit
389ac78b26

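For orientation, a minimal usage sketch (not part of the commit) of the new public API introduced below. It assumes a `ggml_context * ctx` with enough memory and uses only the declarations added to ggml/include/ggml.h in this change; evaluating the result still requires building and computing a graph as usual.

    #include "ggml.h"

    static struct ggml_tensor * build_example(struct ggml_context * ctx) {
        // element-wise ops and a running sum along the first dimension
        struct ggml_tensor * x  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64);
        struct ggml_tensor * sp = ggml_softplus(ctx, x);   // log(1 + exp(x))
        struct ggml_tensor * em = ggml_expm1(ctx, sp);     // exp(sp) - 1
        struct ggml_tensor * cs = ggml_cumsum(ctx, em);    // prefix sums over the row
        (void) cs;

        // square F32 matrices for the triangular ops
        struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 8);
        struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 8);

        a = ggml_fill(ctx, a, 1.0f);                       // fill with a constant
        a = ggml_tri(ctx, a, GGML_TRI_TYPE_LOWER_DIAG);    // zero everything above the diagonal

        // solve A X = B; the assert in ggml.c currently requires lower && left && !uni
        return ggml_solve_tri(ctx, a, b, /*left =*/ true, /*lower =*/ true, /*uni =*/ false);
    }
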
+ 20 - 14
docs/ops.md

@@ -18,7 +18,7 @@ Legend:
 |                              ACC | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
 |                              ADD | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
 |                             ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ |
-|                           ADD_ID | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
+|                           ADD_ID | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                           ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ |
 |                           ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
 |                          ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
@@ -26,9 +26,9 @@ Legend:
 |                            CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
 |                           CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ❌ |
 |                             CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ❌ |
-|                          CONV_2D | ❌ | ❌ | ✅ | 🟡 | ❌ | ✅ | ❌ | ✅ | ❌ |
+|                          CONV_2D | ❌ | ❌ | ✅ |  | ❌ | ✅ | ❌ | ✅ | ❌ |
 |                       CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
-|                          CONV_3D | ❌ | ❌ |  | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
+|                          CONV_3D | ❌ | ❌ |  | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
 |                CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                              COS | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ |
@@ -36,11 +36,14 @@ Legend:
 |                              CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
 |               CROSS_ENTROPY_LOSS | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |          CROSS_ENTROPY_LOSS_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
+|                           CUMSUM | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                    DIAG_MASK_INF | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
 |                              DIV | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
 |                              DUP | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
 |                              ELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
 |                              EXP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
+|                            EXPM1 | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ | ❌ | ❌ |
+|                             FILL | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                   FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ |
 |                            FLOOR | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ |
 |                GATED_LINEAR_ATTN | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ |
@@ -57,7 +60,7 @@ Legend:
 |                      HARDSIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
 |                        HARDSWISH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
 |                           IM2COL | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ |
-|                        IM2COL_3D | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
+|                        IM2COL_3D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                          L2_NORM | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
 |                       LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
 |                              LOG | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ |
@@ -69,10 +72,10 @@ Legend:
 |                             NORM | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
 |                     NORM_MUL_ADD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ |
 |                   OPT_STEP_ADAMW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
-|                     OPT_STEP_SGD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
+|                     OPT_STEP_SGD | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                         OUT_PROD | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ |
-|                              PAD | ❌ | ✅ | ✅ |  | ✅ | ✅ | 🟡 | ✅ | ❌ |
-|                   PAD_REFLECT_1D | ❌ | ✅ | ✅ |  | ✅ | ❌ | ✅ | ❌ | ❌ |
+|                              PAD | ❌ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ |
+|                   PAD_REFLECT_1D | ❌ | ✅ | ✅ |  | ✅ | ❌ | ✅ | ❌ | ❌ |
 |                          POOL_2D | ❌ | 🟡 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
 |                            REGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
 |                             RELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
@@ -80,15 +83,15 @@ Legend:
 |                      REPEAT_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ |
 |                         RMS_NORM | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ |
 |                    RMS_NORM_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ |
-|                 RMS_NORM_MUL_ADD | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
-|                             ROLL | ❌ | ❌ | ✅ |  | ❌ | ❌ | ✅ | ✅ | ❌ |
+|                 RMS_NORM_MUL_ADD | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ |
+|                             ROLL | ❌ | ❌ | ✅ |  | ❌ | ❌ | ✅ | ✅ | ❌ |
 |                             ROPE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
 |                        ROPE_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
 |                            ROUND | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ |
 |                        RWKV_WKV6 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
 |                        RWKV_WKV7 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
 |                            SCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
-|                              SET | ❌ | ❌ | ✅ |  | ✅ | ❌ | 🟡 | ❌ | ❌ |
+|                              SET | ❌ | ❌ | ✅ |  | ✅ | ❌ | 🟡 | ❌ | ❌ |
 |                         SET_ROWS | ❌ | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
 |                              SGN | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
 |                          SIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
@@ -96,21 +99,24 @@ Legend:
 |                        SILU_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
 |                              SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ |
 |                          SOFTCAP | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ |
+|                         SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                         SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
 |                    SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ✅ | ❌ |
+|                        SOLVE_TRI | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                              SQR | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ |
 |                             SQRT | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | ❌ | ❌ |
 |                         SSM_CONV | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
 |                         SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ |
 |                             STEP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
 |                              SUB | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
-|                              SUM | ❌ | ✅ | ✅ |  | ❌ | ❌ | 🟡 | ✅ | ❌ |
-|                         SUM_ROWS | ❌ | ✅ | ✅ |  | ✅ | ✅ | 🟡 | ✅ | ❌ |
+|                              SUM | ❌ | ✅ | ✅ | 🟡 | ❌ | ❌ | 🟡 | ✅ | ❌ |
+|                         SUM_ROWS | ❌ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ |
 |                           SWIGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
-|                       SWIGLU_OAI | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
+|                       SWIGLU_OAI | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                             TANH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | 🟡 | ❌ |
 |               TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
 |                         TOPK_MOE | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ |
+|                              TRI | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                            TRUNC | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ |
 |                          UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ❌ |
-|                            XIELU | ❌ | ❌ |  | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
+|                            XIELU | ❌ | ❌ |  | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |

The file diff is not shown because it is too large
+ 4341 - 1182
docs/ops/CPU.csv


The file diff is not shown because it is too large
+ 2515 - 6894
docs/ops/CUDA.csv


+ 71 - 0
ggml/include/ggml.h

@@ -475,6 +475,7 @@ extern "C" {
         GGML_OP_COS,
         GGML_OP_SUM,
         GGML_OP_SUM_ROWS,
+        GGML_OP_CUMSUM,
         GGML_OP_MEAN,
         GGML_OP_ARGMAX,
         GGML_OP_COUNT_EQUAL,
@@ -530,6 +531,8 @@ extern "C" {
         GGML_OP_TIMESTEP_EMBEDDING,
         GGML_OP_ARGSORT,
         GGML_OP_LEAKY_RELU,
+        GGML_OP_TRI,
+        GGML_OP_FILL,
 
         GGML_OP_FLASH_ATTN_EXT,
         GGML_OP_FLASH_ATTN_BACK,
@@ -542,6 +545,7 @@ extern "C" {
         GGML_OP_RWKV_WKV6,
         GGML_OP_GATED_LINEAR_ATTN,
         GGML_OP_RWKV_WKV7,
+        GGML_OP_SOLVE_TRI,
 
         GGML_OP_UNARY,
 
@@ -576,6 +580,8 @@ extern "C" {
         GGML_UNARY_OP_HARDSWISH,
         GGML_UNARY_OP_HARDSIGMOID,
         GGML_UNARY_OP_EXP,
+        GGML_UNARY_OP_EXPM1,
+        GGML_UNARY_OP_SOFTPLUS,
         GGML_UNARY_OP_GELU_ERF,
         GGML_UNARY_OP_XIELU,
         GGML_UNARY_OP_FLOOR,
@@ -620,6 +626,13 @@ extern "C" {
         GGML_TENSOR_FLAG_LOSS   =  8, // ...defines loss for numerical optimization (multiple loss tensors add up)
     };
 
+    enum ggml_tri_type {
+        GGML_TRI_TYPE_UPPER_DIAG = 0,
+        GGML_TRI_TYPE_UPPER      = 1,
+        GGML_TRI_TYPE_LOWER_DIAG = 2,
+        GGML_TRI_TYPE_LOWER      = 3
+    };
+
     struct ggml_init_params {
         // memory pool
         size_t mem_size;   // bytes
@@ -957,6 +970,22 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
 
+    GGML_API struct ggml_tensor * ggml_expm1(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_expm1_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_softplus(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_softplus_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
     GGML_API struct ggml_tensor * ggml_sin(
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
@@ -983,6 +1012,10 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
 
+    GGML_API struct ggml_tensor * ggml_cumsum(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a);
+
     // mean along rows
     GGML_API struct ggml_tensor * ggml_mean(
             struct ggml_context * ctx,
@@ -2187,6 +2220,23 @@ extern "C" {
             int                   shift2,
             int                   shift3);
 
+    // Convert matrix into a triangular one (upper, strict upper, lower or strict lower) by writing
+    // zeroes everywhere outside the masked area
+    GGML_API struct ggml_tensor * ggml_tri(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            enum ggml_tri_type    type);
+
+    // Fill tensor a with constant c
+    GGML_API struct ggml_tensor * ggml_fill(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            float                 c);
+
+    GGML_API struct ggml_tensor * ggml_fill_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            float                 c);
 
     // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
     // timesteps: [N,]
@@ -2356,6 +2406,27 @@ extern "C" {
             struct ggml_tensor  * b,
             struct ggml_tensor  * state);
 
+    /* Solves a specific equation of the form Ax=B, where A is a triangular matrix
+    *  without zeroes on the diagonal (i.e. invertible).
+    *  B can have any number of columns, but must have the same number of rows as A
+    *  If A is [n, n] and B is [n, m], then the result will be [n, m] as well
+    *  Has O(n^3) complexity (unlike most matrix ops out there), so use on cases
+    *  where n > 100 sparingly, pre-chunk if necessary.
+    *
+    *  If left = false, solves xA=B instead
+    *  If lower = false, assumes upper triangular instead
+    *  If uni = true, assumes diagonal of A to be all ones (will override actual values)
+    *
+    *  TODO: currently only lower, right, non-unitriangular variant is implemented
+    */
+    GGML_API struct ggml_tensor * ggml_solve_tri(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        bool                  left,
+        bool                  lower,
+        bool                  uni);
+
     // custom operators
 
     typedef void (*ggml_custom1_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, int ith, int nth, void * userdata);

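Reading aid (not part of the commit): going by the CPU kernel added in ggml-cpu/ops.cpp further down, ggml_tri keeps the element at row r, column c when c < r (LOWER), c <= r (LOWER_DIAG), c > r (UPPER) or c >= r (UPPER_DIAG), and writes zero everywhere else. Applied to a 3x3 matrix of ones:

    GGML_TRI_TYPE_LOWER_DIAG        GGML_TRI_TYPE_LOWER (strict)
    1 0 0                           0 0 0
    1 1 0                           1 0 0
    1 1 1                           1 1 0

The UPPER variants mirror this above the diagonal.
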
+ 22 - 0
ggml/src/ggml-cpu/ggml-cpu.c

@@ -1731,6 +1731,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_sum_rows(params, tensor);
             } break;
+        case GGML_OP_CUMSUM:
+            {
+                ggml_compute_forward_cumsum(params, tensor);
+            } break;
         case GGML_OP_MEAN:
             {
                 ggml_compute_forward_mean(params, tensor);
@@ -1927,6 +1931,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_leaky_relu(params, tensor);
             } break;
+        case GGML_OP_TRI:
+            {
+                ggml_compute_forward_tri(params, tensor);
+            } break;
+        case GGML_OP_FILL:
+            {
+                ggml_compute_forward_fill(params, tensor);
+            } break;
         case GGML_OP_FLASH_ATTN_EXT:
             {
                 ggml_compute_forward_flash_attn_ext(params, tensor);
@@ -1982,6 +1994,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_rwkv_wkv7(params, tensor);
             } break;
+        case GGML_OP_SOLVE_TRI:
+            {
+                ggml_compute_forward_solve_tri(params, tensor);
+            } break;
         case GGML_OP_MAP_CUSTOM1:
             {
                 ggml_compute_forward_map_custom1(params, tensor);
@@ -2140,6 +2156,9 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
         case GGML_OP_ADD_ID:
         case GGML_OP_ADD1:
         case GGML_OP_ACC:
+        case GGML_OP_CUMSUM:
+        case GGML_OP_TRI:
+        case GGML_OP_FILL:
             {
                 n_tasks = n_threads;
             } break;
@@ -2157,6 +2176,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
                 n_tasks = 1;
             } break;
         case GGML_OP_COUNT_EQUAL:
+        case GGML_OP_SOLVE_TRI:
             {
                 n_tasks = n_threads;
             } break;
@@ -2179,6 +2199,8 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
                 case GGML_UNARY_OP_HARDSWISH:
                 case GGML_UNARY_OP_HARDSIGMOID:
                 case GGML_UNARY_OP_EXP:
+                case GGML_UNARY_OP_SOFTPLUS:
+                case GGML_UNARY_OP_EXPM1:
                 case GGML_UNARY_OP_FLOOR:
                 case GGML_UNARY_OP_CEIL:
                 case GGML_UNARY_OP_ROUND:

+ 208 - 2
ggml/src/ggml-cpu/ops.cpp

@@ -9,6 +9,7 @@
 
 #include <cfloat>
 #include <algorithm>
+#include <cmath>
 #include <functional>
 
 // ggml_compute_forward_dup
@@ -1395,6 +1396,56 @@ void ggml_compute_forward_sum(
     }
 }
 
+// ggml_compute_forward_cumsum
+
+static void ggml_compute_forward_cumsum_f32(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+    GGML_ASSERT(dst->nb[0] == sizeof(float));
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    GGML_ASSERT(ne0 == ne00);
+    GGML_ASSERT(ne1 == ne01);
+    GGML_ASSERT(ne2 == ne02);
+    GGML_ASSERT(ne3 == ne03);
+
+    const auto [ir0, ir1] = get_thread_range(params, src0);
+
+    for (int64_t ir = ir0; ir < ir1; ++ir) {
+        const int64_t i03 = ir/(ne02*ne01);
+        const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+        const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+        float * src_row = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
+        float * dst_row = (float *) ((char *) dst->data  + i01*nb1  + i02*nb2  + i03*nb3);
+
+        ggml_vec_cumsum_f32(ne00, dst_row, src_row);
+    }
+}
+
+void ggml_compute_forward_cumsum(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_cumsum_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
 // ggml_compute_forward_sum_rows
 
 static void ggml_compute_forward_sum_rows_f32(
@@ -2141,6 +2192,83 @@ static void ggml_compute_forward_gelu(
     }
 }
 
+// ggml_compute_fill
+
+static void ggml_compute_forward_fill_f32(const ggml_compute_params * params, ggml_tensor * dst) {
+    const float c = ggml_get_op_params_f32(dst, 0);
+
+    GGML_TENSOR_LOCALS(int64_t, ne, dst, ne);
+    GGML_TENSOR_LOCALS(size_t,  nb, dst, nb);
+
+    const auto [ir0, ir1] = get_thread_range(params, dst);
+
+    for (int64_t ir = ir0; ir < ir1; ++ir) {
+        const int64_t i03 = ir/(ne2*ne1);
+        const int64_t i02 = (ir - i03*ne2*ne1)/ne1;
+        const int64_t i01 = (ir - i03*ne2*ne1 - i02*ne1);
+
+        float * dst_ptr  = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1);
+
+        ggml_vec_set_f32(ne0, dst_ptr, c);
+    }
+}
+
+void ggml_compute_forward_fill(const ggml_compute_params * params, ggml_tensor * dst) {
+    ggml_compute_forward_fill_f32(params, dst);
+}
+
+// ggml_compute_tri
+
+static void ggml_compute_forward_tri_f32(const ggml_compute_params * params, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+
+    const ggml_tri_type ttype = (ggml_tri_type) ggml_get_op_params_i32(dst, 0);
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    const auto [ir0, ir1] = get_thread_range(params, src0);
+
+    bool (*bipred)(int, int);
+
+    switch (ttype) {
+        case GGML_TRI_TYPE_LOWER:      bipred = [](int i, int r) { return i <  r; }; break;
+        case GGML_TRI_TYPE_LOWER_DIAG: bipred = [](int i, int r) { return i <= r; }; break;
+        case GGML_TRI_TYPE_UPPER:      bipred = [](int i, int r) { return i >  r; }; break;
+        case GGML_TRI_TYPE_UPPER_DIAG: bipred = [](int i, int r) { return i >= r; }; break;
+        default: GGML_ABORT("invalid tri type");
+    }
+
+    for (int64_t ir = ir0; ir < ir1; ++ir) {
+        const int64_t i03 = ir/(ne02*ne01);
+        const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+        const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+        const float * src_ptr = (const float  *) ((const char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+              float * dst_ptr = (      float  *) ((      char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1);
+
+        for (int i0 = 0; i0 < ne0; ++i0) {
+            dst_ptr[i0] = bipred(i0, i01) ? src_ptr[i0] : 0.0f;
+        }
+    }
+}
+
+void ggml_compute_forward_tri(const ggml_compute_params * params, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_tri_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
 // ggml_compute_forward_gelu_erf
 
 static void ggml_compute_forward_gelu_erf_f32(
@@ -8536,7 +8664,7 @@ static void ggml_compute_forward_ssm_scan_f32(
                 // n_head
                 for (int h = ih0; h < ih1; ++h) {
                     // ref: https://github.com/state-spaces/mamba/blob/62db608da60f6fc790b8ed9f4b3225e95ca15fde/mamba_ssm/ops/triton/softplus.py#L16
-                    const float dt_soft_plus = ggml_softplus(dt[h]);
+                    const float dt_soft_plus = ggml_compute_softplus_f32(dt[h]);
                     const float dA = expf(dt_soft_plus * A[h]);
                     const int g = h / (nh / ng); // repeat_interleave
 
@@ -8633,7 +8761,7 @@ static void ggml_compute_forward_ssm_scan_f32(
                 // n_head
                 for (int h = ih0; h < ih1; ++h) {
                     // ref: https://github.com/state-spaces/mamba/blob/62db608da60f6fc790b8ed9f4b3225e95ca15fde/mamba_ssm/ops/triton/softplus.py#L16
-                    const float dt_soft_plus = ggml_softplus(dt[h]);
+                    const float dt_soft_plus = ggml_compute_softplus_f32(dt[h]);
                     const int g = h / (nh / ng); // repeat_interleave
 
                     // dim
@@ -8916,6 +9044,14 @@ void ggml_compute_forward_unary(
             {
                 ggml_compute_forward_xielu(params, dst);
             } break;
+        case GGML_UNARY_OP_EXPM1:
+            {
+                ggml_compute_forward_expm1(params, dst);
+            } break;
+        case GGML_UNARY_OP_SOFTPLUS:
+            {
+                ggml_compute_forward_softplus(params, dst);
+            } break;
         default:
             {
                 GGML_ABORT("fatal error");
@@ -9512,6 +9648,76 @@ void ggml_compute_forward_gla(
     }
 }
 
+static void ggml_compute_forward_solve_tri_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst) {
+    const struct ggml_tensor * src0 = dst->src[0];  // A (lower triangular)
+    const struct ggml_tensor * src1 = dst->src[1];  // B (RHS)
+
+    GGML_TENSOR_BINARY_OP_LOCALS;
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type  == GGML_TYPE_F32);
+
+    GGML_ASSERT(ne00 == ne01); // A must be square
+    GGML_ASSERT(ne0  == ne10); // solution cols == B cols
+    GGML_ASSERT(ne1  == ne11); // solution rows == B rows
+
+    GGML_ASSERT(ne02 == ne12 && ne12 == ne2);
+    GGML_ASSERT(ne03 == ne13 && ne13 == ne3);
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int64_t k = ne10;   // number of RHS columns
+    const int64_t n = ne11;   // A is n×n
+    const int64_t nr = ne02 * ne03 * k; // we're parallelizing on columns here, so seq x token x column will be the unit
+
+    // chunks per thread
+    const int64_t dr = (nr + nth - 1)/nth;
+
+    // chunk range for this thread
+    const int64_t ir0 = dr*ith;
+    const int64_t ir1 = MIN(ir0 + dr, nr);
+
+    const float * A = (const float *) src0->data;  // [n, n, B1, B2]
+    const float * B = (const float *) src1->data;  // [n, k, B1, B2]
+          float * X = (      float *) dst->data;   // [n, k, B1, B2]
+
+    for (int64_t ir = ir0; ir < ir1; ++ir) {
+        const int64_t i03 = ir/(ne02*k);
+        const int64_t i02 = (ir - i03*ne02*k)/k;
+        const int64_t i01 = (ir - i03*ne02*k - i02*k);
+
+        const float * A_batch = A + i02 * nb02 / sizeof(float) + i03 * nb03 / sizeof(float);
+        const float * B_batch = B + i02 * nb12 / sizeof(float) + i03 * nb13 / sizeof(float);
+
+        float * X_batch = X + i02 * nb2 / sizeof(float) + i03 * nb3 / sizeof(float);
+
+        for (int64_t i00 = 0; i00 < n; ++i00) {
+            float sum = 0.0f;
+            for (int64_t t = 0; t < i00; ++t) {
+                sum += A_batch[i00 * n + t] * X_batch[i01 * n + t];
+            }
+
+            const float diag = A_batch[i00 * n + i00];
+            GGML_ASSERT(diag != 0.0f && "Zero diagonal in triangular matrix");
+
+            X_batch[i01 * n + i00] = (B_batch[i00 * k + i01] - sum) / diag;
+        }
+    }
+}
+
+void ggml_compute_forward_solve_tri(const struct ggml_compute_params * params, struct ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
+        ggml_compute_forward_solve_tri_f32(params, dst);
+    } else {
+        GGML_ABORT("fatal error");
+    }
+}
+
 // ggml_compute_forward_rwkv_wkv7
 
 static void ggml_compute_forward_rwkv_wkv7_f32(

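Note (not part of the diff): the inner loop of ggml_compute_forward_solve_tri_f32 above is plain forward substitution for a lower-triangular system, applied independently to each right-hand-side column, which is why the work is split across threads by batch x column:

    x_i = ( b_i - \sum_{t < i} a_{i,t} * x_t ) / a_{i,i},   i = 0, ..., n-1

A zero on the diagonal would make the system singular, hence the GGML_ASSERT on diag.
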
+ 4 - 0
ggml/src/ggml-cpu/ops.h

@@ -34,6 +34,7 @@ void ggml_compute_forward_add1(const struct ggml_compute_params * params, struct
 void ggml_compute_forward_acc(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_sum(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_sum_rows(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_cumsum(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_mean(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_argmax(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_count_equal(const struct ggml_compute_params * params, struct ggml_tensor * dst);
@@ -81,6 +82,8 @@ void ggml_compute_forward_arange(const struct ggml_compute_params * params, stru
 void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_leaky_relu(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_tri(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_fill(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_flash_attn_ext(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_flash_attn_back(
         const struct ggml_compute_params * params,
@@ -96,6 +99,7 @@ void ggml_compute_forward_get_rel_pos(const struct ggml_compute_params * params,
 void ggml_compute_forward_add_rel_pos(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_rwkv_wkv6(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_rwkv_wkv7(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_solve_tri(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_gla(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_map_custom1(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_map_custom2(const struct ggml_compute_params * params, struct ggml_tensor * dst);

+ 16 - 0
ggml/src/ggml-cpu/unary-ops.cpp

@@ -73,6 +73,14 @@ static inline float op_log(float x) {
     return logf(x);
 }
 
+static inline float op_expm1(float x) {
+    return expf(x) - 1.0f;
+}
+
+static inline float op_softplus(float x) {
+    return (x > 20.0f) ? x : logf(1.0f + expf(x));
+}
+
 static inline float op_floor(float x) {
     return floorf(x);
 }
@@ -290,6 +298,14 @@ void ggml_compute_forward_log(const ggml_compute_params * params, ggml_tensor *
     unary_op<op_log>(params, dst);
 }
 
+void ggml_compute_forward_expm1(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_expm1>(params, dst);
+}
+
+void ggml_compute_forward_softplus(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_softplus>(params, dst);
+}
+
 void ggml_compute_forward_floor(const ggml_compute_params * params, ggml_tensor * dst) {
     unary_op<op_floor>(params, dst);
 }

+ 2 - 0
ggml/src/ggml-cpu/unary-ops.h

@@ -22,6 +22,8 @@ void ggml_compute_forward_sqrt(const struct ggml_compute_params * params, struct
 void ggml_compute_forward_sin(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_cos(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_log(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_expm1(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_softplus(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_floor(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_ceil(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_round(const struct ggml_compute_params * params, struct ggml_tensor * dst);

+ 10 - 0
ggml/src/ggml-cpu/vec.h

@@ -1416,6 +1416,16 @@ inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
 #endif
 }
 
+inline static void ggml_vec_cumsum_f32(const int n, float * y, const float * x) {
+    for (int i = 0; i < n; ++i) {
+        if (i == 0) {
+            y[i] = x[i];
+        } else {
+            y[i] = y[i - 1] + x[i];
+        }
+    }
+}
+
 inline static void ggml_vec_sum_f32_ggf(const int n, ggml_float * s, const float * x) {
     ggml_float sum = 0.0;
     for (int i = 0; i < n; ++i) {

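Illustration (not part of the diff): ggml_vec_cumsum_f32 applied to [1, 2, 3, 4] writes the running sums [1, 3, 6, 10].
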
+ 8 - 0
ggml/src/ggml-cuda/ggml-cuda.cu

@@ -2527,6 +2527,12 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
                 case GGML_UNARY_OP_TRUNC:
                     ggml_cuda_op_trunc(ctx, dst);
                     break;
+                case GGML_UNARY_OP_EXPM1:
+                    ggml_cuda_op_expm1(ctx, dst);
+                    break;
+                case GGML_UNARY_OP_SOFTPLUS:
+                    ggml_cuda_op_softplus(ctx, dst);
+                    break;
                 default:
                     return false;
             }
@@ -3829,6 +3835,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
                 case GGML_UNARY_OP_GELU_QUICK:
                 case GGML_UNARY_OP_TANH:
                 case GGML_UNARY_OP_EXP:
+                case GGML_UNARY_OP_EXPM1:
+                case GGML_UNARY_OP_SOFTPLUS:
                 case GGML_UNARY_OP_ELU:
                 case GGML_UNARY_OP_FLOOR:
                 case GGML_UNARY_OP_CEIL:

+ 16 - 0
ggml/src/ggml-cuda/unary.cu

@@ -81,6 +81,14 @@ static __device__ __forceinline__ float op_log(float x) {
     return logf(x);
 }
 
+static __device__ __forceinline__ float op_expm1(float x) {
+    return expm1f(x);
+}
+
+static __device__ __forceinline__ float op_softplus(float x) {
+    return (x > 20.0f) ? x : logf(1.0f + expf(x));
+}
+
 static __device__ __forceinline__ float op_elu(float x) {
     return (x > 0.f) ? x : expm1f(x);
 }
@@ -233,6 +241,14 @@ void ggml_cuda_op_round(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
 void ggml_cuda_op_trunc(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     ggml_cuda_op_unary<op_trunc>(ctx, dst);
 }
+
+void ggml_cuda_op_expm1(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    ggml_cuda_op_unary<op_expm1>(ctx, dst);
+}
+
+void ggml_cuda_op_softplus(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    ggml_cuda_op_unary<op_softplus>(ctx, dst);
+}
 /* gated ops */
 
 template <float (*op)(float), typename T>

+ 4 - 0
ggml/src/ggml-cuda/unary.cuh

@@ -61,6 +61,10 @@ void ggml_cuda_op_cos(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 
 void ggml_cuda_op_log(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 
+void ggml_cuda_op_expm1(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+void ggml_cuda_op_softplus(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
 void ggml_cuda_op_elu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 
 void ggml_cuda_op_floor(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

+ 1 - 1
ggml/src/ggml-impl.h

@@ -102,7 +102,7 @@ static bool ggml_op_is_empty(enum ggml_op op) {
     }
 }
 
-static inline float ggml_softplus(float input) {
+static inline float ggml_compute_softplus_f32(float input) {
     return (input > 20.0f) ? input : logf(1 + expf(input));
 }
 //

+ 154 - 5
ggml/src/ggml.c

@@ -935,6 +935,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "COS",
     "SUM",
     "SUM_ROWS",
+    "CUMSUM",
     "MEAN",
     "ARGMAX",
     "COUNT_EQUAL",
@@ -990,6 +991,8 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "TIMESTEP_EMBEDDING",
     "ARGSORT",
     "LEAKY_RELU",
+    "TRI",
+    "FILL",
 
     "FLASH_ATTN_EXT",
     "FLASH_ATTN_BACK",
@@ -1002,6 +1005,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "RWKV_WKV6",
     "GATED_LINEAR_ATTN",
     "RWKV_WKV7",
+    "SOLVE_TRI",
 
     "UNARY",
 
@@ -1019,7 +1023,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "GLU",
 };
 
-static_assert(GGML_OP_COUNT == 90, "GGML_OP_COUNT != 90");
+static_assert(GGML_OP_COUNT == 94, "GGML_OP_COUNT != 94");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -1039,6 +1043,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "cos(x)",
     "Σx",
     "Σx_k",
+    "cumsum(x)",
     "Σx/n",
     "argmax(x)",
     "count_equal(x)",
@@ -1094,6 +1099,8 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "timestep_embedding(timesteps, dim, max_period)",
     "argsort(x)",
     "leaky_relu(x)",
+    "tri(x)",
+    "fill(x, c)",
 
     "flash_attn_ext(x)",
     "flash_attn_back(x)",
@@ -1106,6 +1113,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "rwkv_wkv6(k, v, r, tf, td, s)",
     "gated_linear_attn(k, v, q, gate, s)",
     "rwkv_wkv7(r, w, k, v, a, b, s)",
+    "A X = B, A triangular, solve X",
 
     "unary(x)",
 
@@ -1123,7 +1131,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "glu(x)",
 };
 
-static_assert(GGML_OP_COUNT == 90, "GGML_OP_COUNT != 90");
+static_assert(GGML_OP_COUNT == 94, "GGML_OP_COUNT != 94");
 
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
 
@@ -1142,6 +1150,8 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
     "HARDSWISH",
     "HARDSIGMOID",
     "EXP",
+    "EXPM1",
+    "SOFTPLUS",
     "GELU_ERF",
     "XIELU",
     "FLOOR",
@@ -1150,7 +1160,7 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
     "TRUNC",
 };
 
-static_assert(GGML_UNARY_OP_COUNT == 20, "GGML_UNARY_OP_COUNT != 20");
+static_assert(GGML_UNARY_OP_COUNT == 22, "GGML_UNARY_OP_COUNT != 22");
 
 static const char * GGML_GLU_OP_NAME[GGML_GLU_OP_COUNT] = {
     "REGLU",
@@ -2258,6 +2268,30 @@ struct ggml_tensor * ggml_log_inplace(
     return ggml_log_impl(ctx, a, true);
 }
 
+struct ggml_tensor * ggml_expm1(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_unary(ctx, a, GGML_UNARY_OP_EXPM1);
+}
+
+struct ggml_tensor * ggml_expm1_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_EXPM1);
+}
+
+struct ggml_tensor * ggml_softplus(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_unary(ctx, a, GGML_UNARY_OP_SOFTPLUS);
+}
+
+struct ggml_tensor * ggml_softplus_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_SOFTPLUS);
+}
+
 // ggml_sin
 
 static struct ggml_tensor * ggml_sin_impl(
@@ -2341,6 +2375,21 @@ struct ggml_tensor * ggml_sum_rows(
     return result;
 }
 
+// ggml_cumsum
+
+struct ggml_tensor * ggml_cumsum(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    GGML_ASSERT(a->type == GGML_TYPE_F32);
+
+    struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
+
+    result->op     = GGML_OP_CUMSUM;
+    result->src[0] = a;
+
+    return result;
+}
+
 // ggml_mean
 
 struct ggml_tensor * ggml_mean(
@@ -2668,8 +2717,8 @@ struct ggml_tensor * ggml_xielu(
     struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
 
     ggml_set_op_params_i32(result, 0, (int32_t) GGML_UNARY_OP_XIELU);
-    ggml_set_op_params_f32(result, 1, beta + ggml_softplus(alpha_n));
-    ggml_set_op_params_f32(result, 2, ggml_softplus(alpha_p));
+    ggml_set_op_params_f32(result, 1, beta + ggml_compute_softplus_f32(alpha_n));
+    ggml_set_op_params_f32(result, 2, ggml_compute_softplus_f32(alpha_p));
     ggml_set_op_params_f32(result, 3, beta);
     ggml_set_op_params_f32(result, 4, eps);
 
@@ -5028,6 +5077,61 @@ struct ggml_tensor * ggml_timestep_embedding(
     return result;
 }
 
+// ggml_tri
+
+struct ggml_tensor * ggml_tri(
+    struct ggml_context * ctx,
+    struct ggml_tensor  * a,
+    enum ggml_tri_type    type) {
+    GGML_ASSERT(a->type == GGML_TYPE_F32);
+
+    GGML_ASSERT(ggml_is_contiguous(a));
+    GGML_ASSERT(a->ne[0] == a->ne[1]);
+
+    struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
+
+    ggml_set_op_params_i32(result, 0, type);
+
+    result->op = GGML_OP_TRI;
+    result->src[0] = a;
+
+    return result;
+}
+
+// ggml_fill
+
+static struct ggml_tensor * ggml_fill_impl(
+    struct ggml_context * ctx,
+    struct ggml_tensor  * a,
+    float                 c,
+    bool                  inplace) {
+    GGML_ASSERT(a->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_is_contiguous(a));
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    ggml_set_op_params_f32(result, 0, c);
+
+    result->op = GGML_OP_FILL;
+    result->src[0] = a;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_fill(
+    struct ggml_context * ctx,
+    struct ggml_tensor  * a,
+    float                 c) {
+    return ggml_fill_impl(ctx, a, c, false);
+}
+
+struct ggml_tensor * ggml_fill_inplace(
+    struct ggml_context * ctx,
+    struct ggml_tensor  * a,
+    float                 c) {
+    return ggml_fill_impl(ctx, a, c, true);
+}
+
 // ggml_argsort
 
 struct ggml_tensor * ggml_argsort(
@@ -5882,6 +5986,41 @@ struct ggml_tensor * ggml_opt_step_sgd(
     return result;
 }
 
+// solve_tri
+
+struct ggml_tensor * ggml_solve_tri(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        bool                  left,
+        bool                  lower,
+        bool                  uni) {
+    GGML_ASSERT(a->type == GGML_TYPE_F32);
+    GGML_ASSERT(b->type == GGML_TYPE_F32);
+
+    // A must be square and lower diagonal
+    GGML_ASSERT(a->ne[0] == a->ne[1]);
+    // B must have same outer dimension as A
+    GGML_ASSERT(a->ne[1] == b->ne[1]);
+
+    // batch dimensions must be equal
+    GGML_ASSERT(a->ne[2] == b->ne[2]);
+    GGML_ASSERT(a->ne[3] == b->ne[3]);
+
+    GGML_ASSERT(ggml_is_contiguous(a));
+    GGML_ASSERT(ggml_is_contiguous(b));
+
+    GGML_ASSERT(lower && left && !uni); // TODO: support other variants
+
+    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, b->ne[0], b->ne[1], b->ne[2], b->ne[3]);
+
+    result->op     = GGML_OP_SOLVE_TRI;
+    result->src[0] = a;
+    result->src[1] = b;
+
+    return result;
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 
 struct ggml_hash_set ggml_hash_set_new(size_t size) {
@@ -6454,6 +6593,16 @@ static void ggml_compute_backward(
                         ggml_add_or_set(ctx, cgraph, isrc0, ggml_mul(ctx, tensor, grad));
                     }
                 } break;
+                case GGML_UNARY_OP_EXPM1: {
+                    if (src0_needs_grads) {
+                        ggml_add_or_set(ctx, cgraph, isrc0, ggml_mul(ctx, grad, ggml_exp(ctx, src0)));
+                    }
+                } break;
+                case GGML_UNARY_OP_SOFTPLUS: {
+                    if (src0_needs_grads) {
+                        ggml_add_or_set(ctx, cgraph, isrc0, ggml_mul(ctx, grad, ggml_sigmoid(ctx, src0)));
+                    }
+                } break;
                 default: {
                     fprintf(stderr, "%s: unsupported unary op for backward pass: %s\n",
                         __func__, ggml_unary_op_name(ggml_get_unary_op(tensor)));

+ 224 - 2
tests/test-backend-ops.cpp

@@ -175,6 +175,38 @@ static void init_tensor_kq_mask(ggml_tensor * tensor, float min = -1.0f, float m
     ggml_backend_tensor_set(tensor, data_f16.data(), 0, data_f16.size()*sizeof(ggml_fp16_t));
 }

+// generate a lower triangular matrix
+static void init_tensor_tril(ggml_tensor * tensor, float min = -1.0f, float max = 1.0f) {
+    GGML_ASSERT(tensor->type == GGML_TYPE_F32);
+    GGML_ASSERT(tensor->ne[0] == tensor->ne[1]);
+
+    GGML_TENSOR_LOCALS(int32_t, ne, tensor, ne);
+    GGML_TENSOR_LOCALS(size_t, nb, tensor, nb);
+
+    std::vector<float> data_f32(ne0*ne1*ne2*ne3);
+
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_real_distribution<float> dis(min, max);
+
+    for (int64_t i3 = 0; i3 < ne3; i3++) {
+        for (int64_t i2 = 0; i2 < ne2; i2++) {
+            for (int64_t i1 = 0; i1 < ne1; i1++) {
+                for (int64_t i0 = 0; i0 < ne0; i0++) {
+                    int64_t idx = (i0 * nb0 + i1 * nb1 + i2 * nb2 + i3 * nb3) / sizeof(float);
+                    if (i0 <= i1) {
+                        data_f32[idx] = dis(gen);
+                    } else {
+                        data_f32[idx] = 0.0f;
+                    }
+                }
+            }
+        }
+    }
+
+    ggml_backend_tensor_set(tensor, data_f32.data(), 0, ggml_nbytes(tensor));
+}
+
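The index above is built from the byte strides nb0..nb3 divided by sizeof(float); for the contiguous F32 tensors used in these tests that is the same as the plain row-major index, with i0 as the column and i1 as the row, so the i0 <= i1 branch fills the diagonal and everything below it (a small illustration, not extra test code):

    // contiguous F32 layout: nb0 = 4, nb1 = nb0*ne0, nb2 = nb1*ne1, nb3 = nb2*ne2
    const int64_t idx = i0 + i1*ne0 + i2*ne0*ne1 + i3*ne0*ne1*ne2;  // equals the stride-based index above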
 static std::vector<float> tensor_to_float(const ggml_tensor * t) {
     std::vector<float> tv;
     tv.reserve(ggml_nelements(t));
@@ -1804,7 +1836,8 @@ struct test_unary : public test_case {
 
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         const bool grad_supported = op == GGML_UNARY_OP_ABS || op == GGML_UNARY_OP_SGN || op == GGML_UNARY_OP_NEG ||
-            op == GGML_UNARY_OP_STEP || op == GGML_UNARY_OP_RELU || op == GGML_UNARY_OP_SILU;
+            op == GGML_UNARY_OP_STEP || op == GGML_UNARY_OP_RELU || op == GGML_UNARY_OP_SILU ||
+            op == GGML_UNARY_OP_EXPM1 || op == GGML_UNARY_OP_SOFTPLUS;
 
 
         ggml_tensor * a;
         if (v & 1) {
@@ -2779,7 +2812,7 @@ struct test_bin_bcast : public test_case {
     const std::array<int, 4> nr;
     int nf; // number of fused ops, nf == 1 -> single op (no fusion)

-    bool run_whole_graph() override { return true; }
+    bool run_whole_graph() override { return nf > 1; }
 
 
     std::string vars() override {
         return VARS_TO_STR4(type, ne, nr, nf);
@@ -5395,6 +5428,7 @@ struct test_pad : public test_case {
     }
 };

+// GGML_OP_PAD (with extension)
 struct test_pad_ext : public test_case {
     const ggml_type type;
     const std::array<int64_t, 4> ne_a;
@@ -5802,6 +5836,7 @@ struct test_opt_step_adamw : public test_case {
     }
 };

+// GGML_OP_OPT_STEP_SGD
 struct test_opt_step_sgd : public test_case {
     const ggml_type              type;
     const std::array<int64_t, 4> ne;
@@ -5841,6 +5876,170 @@ struct test_opt_step_sgd : public test_case {
     }
 };

+// GGML_OP_CUMSUM
+struct test_cumsum : public test_case {
+    const ggml_type              type;
+    const std::array<int64_t, 4> ne;
+
+    std::string vars() override { return VARS_TO_STR2(type, ne); }
+
+    test_cumsum(ggml_type type = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne = { 10, 5, 4, 3 })
+        : type(type), ne(ne) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
+        ggml_set_param(a);
+        ggml_set_name(a, "a");
+
+        ggml_tensor * out = ggml_cumsum(ctx, a);
+
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+
+    void initialize_tensors(ggml_context * ctx) override {
+        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+            init_tensor_uniform(t, -1.0f, 1.0f);
+        }
+    }
+};
+
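GGML_OP_CUMSUM produces a running sum; judging by the op name and this test it presumably accumulates along the innermost dimension ne0, turning each row [x0, x1, x2, ...] into [x0, x0+x1, x0+x1+x2, ...] (an assumption here — the exact axis is defined by the backend kernels). A one-row reference:

    // reference cumulative sum over one row of length n
    static void cumsum_row(const float * src, float * dst, int n) {
        float acc = 0.0f;
        for (int i = 0; i < n; ++i) {
            acc   += src[i];
            dst[i] = acc;
        }
    }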
+// GGML_OP_XIELU
+struct test_xielu : public test_case {
+    const ggml_type              type;
+    const std::array<int64_t, 4> ne;
+
+    std::string vars() override { return VARS_TO_STR2(type, ne); }
+
+    test_xielu(ggml_type type = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne = { 10, 5, 4, 3 })
+        : type(type), ne(ne) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
+        ggml_set_param(a);
+        ggml_set_name(a, "a");
+
+        float alpha_n = 4.0f;
+        float alpha_p = 20.0f;
+        float beta = 0.5f;
+        float eps = 0.0000001f;
+
+        ggml_tensor * out = ggml_xielu(ctx, a, alpha_n, alpha_p, beta, eps);
+
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+
+    void initialize_tensors(ggml_context * ctx) override {
+        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+            init_tensor_uniform(t, -1.0f, 1.0f);
+        }
+    }
+};
+
+// GGML_OP_TRI
+struct test_tri : public test_case {
+    const ggml_type              type;
+    const std::array<int64_t, 4> ne;
+    const ggml_tri_type          tri_type;
+
+    std::string vars() override { return VARS_TO_STR3(type, ne, tri_type); }
+
+    test_tri(ggml_tri_type tri_type, ggml_type type = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne = { 10, 10, 4, 3 })
+        : type(type), ne(ne), tri_type(tri_type) {
+            GGML_ASSERT(ne[0] == ne[1]);
+        }
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
+        ggml_set_param(a);
+        ggml_set_name(a, "a");
+
+        ggml_tensor * out = ggml_tri(ctx, a, tri_type);
+
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+
+    void initialize_tensors(ggml_context * ctx) override {
+        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+            init_tensor_uniform(t, -1.0f, 1.0f);
+        }
+    }
+};
+
+// GGML_OP_FILL
+struct test_fill : public test_case {
+    const ggml_type              type;
+    const std::array<int64_t, 4> ne;
+    float                        c;
+
+    std::string vars() override { return VARS_TO_STR3(type, ne, c); }
+
+    test_fill(float c, ggml_type type = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne = { 10, 10, 4, 3 })
+        : type(type), ne(ne), c(c) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
+        ggml_set_param(a);
+        ggml_set_name(a, "a");
+
+        ggml_tensor * out = ggml_fill(ctx, a, c);
+
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+};
+
+// GGML_OP_SOLVE_TRI
+struct test_solve_tri : public test_case {
+    const ggml_type              type;
+    const std::array<int64_t, 4> ne_lhs;
+    const std::array<int64_t, 4> ne_rhs;
+
+    std::string vars() override { return VARS_TO_STR3(type, ne_lhs, ne_rhs); }
+
+    test_solve_tri(ggml_type type = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne_lhs = { 10, 10, 4, 3 },
+            std::array<int64_t, 4> ne_rhs = { 3, 10, 4, 3 }
+        )
+        : type(type), ne_lhs(ne_lhs), ne_rhs(ne_rhs) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne_lhs[0], ne_lhs[1], ne_lhs[2], ne_lhs[3]);
+        ggml_set_param(a);
+        ggml_set_name(a, "a");
+
+        ggml_tensor * b = ggml_new_tensor_4d(ctx, type, ne_rhs[0], ne_rhs[1], ne_rhs[2], ne_rhs[3]);
+        ggml_set_param(b);
+        ggml_set_name(b, "b");
+
+        ggml_tensor * out = ggml_solve_tri(ctx, a, b, true, true, false);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+
+    void initialize_tensors(ggml_context * ctx) override {
+        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+            if (strcmp(t->name, "a") == 0) {
+                // note: avoid zeros in the diagonal
+                init_tensor_tril(t, 0.1, 1.0f);
+            } else {
+                init_tensor_uniform(t, -1.0f, 1.0f);
+            }
+        }
+    }
+};
+
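Initialising "a" with init_tensor_tril(0.1, 1.0f) keeps the diagonal entries in [0.1, 1.0], so the forward substitution in SOLVE_TRI never divides by a (near-)zero pivot and the comparison against the reference backend stays stable. A tiny worked instance of the expected result (illustrative numbers only):

    // A = | 2 0 |   B = | 4 |   =>  x0 = 4 / 2         = 2
    //     | 1 3 |       | 7 |       x1 = (7 - 1*2) / 3 = 5/3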
 enum llm_norm_type {
     LLM_NORM,
     LLM_NORM_RMS,
@@ -6282,6 +6481,9 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     for (ggml_type type : {GGML_TYPE_F16, GGML_TYPE_F32}) {
         for (int v : {0, 1}) {
             for (int op = 0; op < GGML_UNARY_OP_COUNT; op++) {
+                if (op == GGML_UNARY_OP_XIELU) {
+                    continue; // need extra params, separate test
+                }
                 test_cases.emplace_back(new test_unary((ggml_unary_op) op, type, { 128, 2, 2, 2 }, v));
                 test_cases.emplace_back(new test_unary((ggml_unary_op) op, type, { 5, 7, 11, 13 }, v));
             }
@@ -7339,6 +7541,26 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     test_cases.emplace_back(new test_arange());
     test_cases.emplace_back(new test_timestep_embedding());
     test_cases.emplace_back(new test_leaky_relu());
+    test_cases.emplace_back(new test_cumsum());
+
+    test_cases.emplace_back(new test_xielu());
+
+    test_cases.emplace_back(new test_tri(GGML_TRI_TYPE_LOWER));
+    test_cases.emplace_back(new test_tri(GGML_TRI_TYPE_LOWER_DIAG));
+    test_cases.emplace_back(new test_tri(GGML_TRI_TYPE_UPPER));
+    test_cases.emplace_back(new test_tri(GGML_TRI_TYPE_UPPER_DIAG));
+
+    test_cases.emplace_back(new test_fill(0.0f));
+    test_cases.emplace_back(new test_fill(2.0f, GGML_TYPE_F32, { 303, 207, 11, 3 }));
+    test_cases.emplace_back(new test_fill(-152.0f, GGML_TYPE_F32, { 800, 600, 4, 4 }));
+
+    test_cases.emplace_back(new test_solve_tri());
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 11, 11, 1, 1 }, { 5, 11, 1, 1 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 17, 17, 2, 4 }, { 9, 17, 2, 4 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 30, 30, 7, 1 }, { 8, 30, 7, 1 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 42, 42, 5, 2 }, { 10, 42, 5, 2 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 2, 2 }, { 10, 64, 2, 2 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 100, 100, 4, 4 }, { 41, 100, 4, 4 }));
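To run only the new cases, the op filter of the test binary can be used; the exact invocation below assumes the usual test-backend-ops command line (mode followed by -o <op name>) and may need adjusting:

    ./build/bin/test-backend-ops test -o CUMSUM
    ./build/bin/test-backend-ops test -o SOLVE_TRI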
 
 
     for (bool v : {false, true}) {
         test_cases.emplace_back(new test_pad_ext(GGML_TYPE_F32, {512, 512, 1, 1}, 0, 1, 0, 1, 0, 0, 0, 0, v));

Some files were not shown because too many files changed in this diff