пре 1 година · 432df2d5f9
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -1500,7 +1500,7 @@ extern "C" {
 
															     // rotary position embedding backward, i.e compute dx from dy
														
 
															     // a - dy
														
 
															-    GGML_API struct ggml_tensor * ggml_rope_back(
														
 
															+    GGML_API struct ggml_tensor * ggml_rope_ext_back(
														
 
															             struct ggml_context * ctx,
														
 
															             struct ggml_tensor  * a, // gradients of ggml_rope result
														
 
															             struct ggml_tensor  * b, // positions
														
@@ -1515,6 +1515,23 @@ extern "C" {
 
															             float                 beta_fast,
														
 
															             float                 beta_slow);
														
 
															+    GGML_API struct ggml_tensor * ggml_rope_multi_back( // NOTE(review): presumably the multi-section counterpart of ggml_rope_ext_back above - confirm against the implementation
														
 
															+            struct ggml_context * ctx,
														
 
															+            struct ggml_tensor  * a, // presumably gradients of the forward rope result, as for ggml_rope_ext_back - TODO confirm
														
 
															+            struct ggml_tensor  * b, // presumably positions, as for ggml_rope_ext_back - TODO confirm
														
 
															+            struct ggml_tensor  * c, // NOTE(review): meaning not visible in this header - document it
														
 
															+            int                   n_dims,
														
 
															+            int                   sections[4], // declared as int[4]; in C this is passed as int * and the length is not checked by the compiler
														
 
															+            int                   mode,
														
 
															+            int                   n_ctx_orig,
														
 
															+            float                 freq_base,
														
 
															+            float                 freq_scale,
														
 
															+            float                 ext_factor,
														
 
															+            float                 attn_factor,
														
 
															+            float                 beta_fast,
														
 
															+            float                 beta_slow);
														
 
															+
														
 
															+
														
 
															     // clamp
														
 
															     // in-place, returns view(a)
														
 
															     GGML_API struct ggml_tensor * ggml_clamp(
														
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -13668,6 +13668,7 @@ struct ggml_cplan ggml_graph_plan(
 
															                     } break;
														
 
															                 case GGML_OP_SOFT_MAX:
														
 
															                 case GGML_OP_ROPE:
														
 
															+                case GGML_OP_ROPE_BACK:
														
 
															                     {
														
 
															                         cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
														
 
															                     } break;
														
--- a/ggml/src/ggml-cpu/ggml-cpu.cpp
+++ b/ggml/src/ggml-cpu/ggml-cpu.cpp
@@ -403,8 +403,6 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
 
															                 op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
														
 
															         case GGML_OP_MUL_MAT:
														
 
															             return src1->type == GGML_TYPE_F32 || src1->type == ggml_get_type_traits_cpu(src0->type)->vec_dot_type;
														
 
															-        case GGML_OP_ROPE_BACK:
														
 
															-            return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
														
 
															         case GGML_OP_IM2COL_BACK:
														
 
															             return src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32;
														
 
															         case GGML_OP_OUT_PROD:
														
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2141,6 +2141,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
 
															         case GGML_OP_ROPE:
														
 
															             ggml_cuda_op_rope(ctx, dst);
														
 
															             break;
														
 
															+        case GGML_OP_ROPE_BACK:
														
 
															+            ggml_cuda_op_rope_back(ctx, dst);
														
 
															+            break;
														
 
															         case GGML_OP_IM2COL:
														
 
															             ggml_cuda_op_im2col(ctx, dst);
														
 
															             break;
														
@@ -3025,7 +3028,11 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
 
															         case GGML_OP_SOFT_MAX:
														
 
															             return true;
														
 
															         case GGML_OP_ROPE:
														
 
															-            return ggml_is_contiguous(op->src[0]);
														
 
															+        case GGML_OP_ROPE_BACK: {
														
 
															+            const size_t ts = ggml_type_size(op->src[0]->type);
														
 
															+            const int64_t ne0_012 = op->src[0]->ne[0] * op->src[0]->ne[1] * op->src[0]->ne[2];
														
 
															+            return op->src[0]->nb[0] == ts && op->src[0]->nb[3] == ne0_012*ts;
														
 
															+        }
														
 
															         case GGML_OP_IM2COL:
														
 
															         case GGML_OP_POOL_2D:
														
 
															         case GGML_OP_SUM:
														
@@ -3081,6 +3088,7 @@ static int64_t get_op_batch_size(const ggml_tensor * op) {
 
															             return op->ne[1];
														
 
															         case GGML_OP_MUL_MAT_ID:
														
 
															         case GGML_OP_ROPE:
														
 
															+        case GGML_OP_ROPE_BACK:
														
 
															             return op->ne[2];
														
 
															         default:
														
 
															             return ggml_nrows(op);
														
--- a/ggml/src/ggml-cuda/rope.cu
+++ b/ggml/src/ggml-cuda/rope.cu
@@ -16,9 +16,10 @@ static __device__ float rope_yarn_ramp(const float low, const float high, const
 
															 // YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
														
 
															 // MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
														
 
															+template<bool forward>
														
 
															 static __device__ void rope_yarn(
														
 
															-    float theta_extrap, float freq_scale, rope_corr_dims corr_dims, int64_t i0, float ext_factor, float mscale,
														
 
															-    float * cos_theta, float * sin_theta) {
														
 
															+        const float theta_extrap, const float freq_scale, const rope_corr_dims corr_dims, const int64_t i0, const float ext_factor,
														
 
															+        float mscale, float & cos_theta, float & sin_theta) {
														
 
															     // Get n-d rotational scaling corrected for extrapolation
														
 
															     float theta_interp = freq_scale * theta_extrap;
														
 
															     float theta = theta_interp;
														
@@ -29,24 +30,28 @@ static __device__ void rope_yarn(
 
															         // Get n-d magnitude scaling corrected for interpolation
														
 
															         mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale);
														
 
															     }
														
 
															-    *cos_theta = cosf(theta) * mscale;
														
 
															-    *sin_theta = sinf(theta) * mscale;
														
 
															+    cos_theta = cosf(theta) * mscale;
														
 
															+    sin_theta = sinf(theta) * mscale;
														
 
															+    if (!forward) {
														
 
															+        sin_theta *= -1.0f;
														
 
															+    }
														
 
															 }
														
 
															-template<typename T, bool has_ff>
														
 
															+template<bool forward, bool has_ff, typename T>
														
 
															 static __global__ void rope_norm(
														
 
															-    const T * x, T * dst, int ne0, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
														
 
															-    float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors) {
														
 
															+        const T * __restrict__ x, T * __restrict__ dst, const int ne0, const int ne1, const int s1, const int s2, const int n_dims,
														
 
															+        const int32_t * __restrict__ pos, const float freq_scale, const float ext_factor, const float attn_factor,
														
 
															+        const rope_corr_dims corr_dims, const float theta_scale, const float * __restrict__ freq_factors) {
														
 
															     const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
														
 
															     if (i0 >= ne0) {
														
 
															         return;
														
 
															     }
														
 
															-    const int row = blockDim.x*blockIdx.x + threadIdx.x;
														
 
															+    const int row_dst = blockDim.x*blockIdx.x + threadIdx.x;
														
 
															     if (i0 >= n_dims) {
														
 
															-        const int i = row*ne0 + i0;
														
 
															-        dst[i + 0] = x[i + 0];
														
 
															-        dst[i + 1] = x[i + 1];
														
 
															+        const int i = row_dst*ne0 + i0; // dst is indexed densely; x must be read through s1 (row) / s2 (channel) strides, same as ix in the rotated path
														
 
															+        dst[i + 0] = x[(row_dst / ne1)*s2 + (row_dst % ne1)*s1 + i0 + 0]; // unrotated tail: plain copy from the strided source
														
 
															+        dst[i + 1] = x[(row_dst / ne1)*s2 + (row_dst % ne1)*s1 + i0 + 1];
														
@@ -54,39 +59,43 @@ static __global__ void rope_norm(
 
															         return;
														
 
															     }
														
 
															-    const int i  = row*ne0 + i0;
														
 
															-    const int i2 = row/p_delta_rows;
														
 
															+    const int row_x     = row_dst % ne1;
														
 
															+    const int channel_x = row_dst / ne1;
														
 
															+
														
 
															+    const int idst = row_dst*ne0 + i0;
														
 
															+    const int ix   = channel_x*s2 + row_x*s1 + i0;
														
 
															-    const float theta_base = pos[i2]*powf(theta_scale, i0/2.0f);
														
 
															+    const float theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f);
														
 
															     const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
														
 
															     float cos_theta;
														
 
															     float sin_theta;
														
 
															-    rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
														
 
															+    rope_yarn<forward>(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, cos_theta, sin_theta);
														
 
															-    const float x0 = x[i + 0];
														
 
															-    const float x1 = x[i + 1];
														
 
															+    const float x0 = x[ix + 0];
														
 
															+    const float x1 = x[ix + 1];
														
 
															-    dst[i + 0] = x0*cos_theta - x1*sin_theta;
														
 
															-    dst[i + 1] = x0*sin_theta + x1*cos_theta;
														
 
															+    dst[idst + 0] = x0*cos_theta - x1*sin_theta;
														
 
															+    dst[idst + 1] = x0*sin_theta + x1*cos_theta;
														
 
															 }
														
 
															-template<typename T, bool has_ff>
														
 
															+template<bool forward, bool has_ff, typename T>
														
 
															 static __global__ void rope_neox(
														
 
															-    const T * x, T * dst, int ne0, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
														
 
															-    float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors) {
														
 
															+        const T * __restrict__ x, T * __restrict__ dst, const int ne0, const int ne1, const int s1, const int s2, const int n_dims,
														
 
															+        const int32_t * __restrict__ pos, const float freq_scale, const float ext_factor, const float attn_factor,
														
 
															+        const rope_corr_dims corr_dims, const float theta_scale, const float * __restrict__ freq_factors) {
														
 
															     const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
														
 
															     if (i0 >= ne0) {
														
 
															         return;
														
 
															     }
														
 
															-    const int row = blockDim.x*blockIdx.x + threadIdx.x;
														
 
															+    const int row_dst = blockDim.x*blockIdx.x + threadIdx.x;
														
 
															     if (i0 >= n_dims) {
														
 
															-        const int i = row*ne0 + i0;
														
 
															-        dst[i + 0] = x[i + 0];
														
 
															-        dst[i + 1] = x[i + 1];
														
 
															+        const int i = row_dst*ne0 + i0; // dst is indexed densely; x must be read through s1 (row) / s2 (channel) strides, same as ix in the rotated path
														
 
															+        dst[i + 0] = x[(row_dst / ne1)*s2 + (row_dst % ne1)*s1 + i0 + 0]; // unrotated tail: plain copy from the strided source
														
 
															+        dst[i + 1] = x[(row_dst / ne1)*s2 + (row_dst % ne1)*s1 + i0 + 1];
														
@@ -94,39 +103,43 @@ static __global__ void rope_neox(
 
															         return;
														
 
															     }
														
 
															-    const int i  = row*ne0 + i0/2;
														
 
															-    const int i2 = row/p_delta_rows;
														
 
															+    const int row_x     = row_dst % ne1;
														
 
															+    const int channel_x = row_dst / ne1;
														
 
															+
														
 
															+    const int idst = row_dst*ne0 + i0/2;
														
 
															+    const int ix   = channel_x*s2 + row_x*s1 + i0/2;
														
 
															-    const float theta_base = pos[i2]*powf(theta_scale, i0/2.0f);
														
 
															+    const float theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f);
														
 
															     const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
														
 
															     float cos_theta;
														
 
															     float sin_theta;
														
 
															-    rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
														
 
															+    rope_yarn<forward>(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, cos_theta, sin_theta);
														
 
															-    const float x0 = x[i + 0];
														
 
															-    const float x1 = x[i + n_dims/2];
														
 
															+    const float x0 = x[ix + 0];
														
 
															+    const float x1 = x[ix + n_dims/2];
														
 
															-    dst[i + 0]        = x0*cos_theta - x1*sin_theta;
														
 
															-    dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
														
 
															+    dst[idst + 0]        = x0*cos_theta - x1*sin_theta;
														
 
															+    dst[idst + n_dims/2] = x0*sin_theta + x1*cos_theta;
														
 
															 }
														
 
															-template<typename T, bool has_ff>
														
 
															+template<bool forward, bool has_ff, typename T>
														
 
															 static __global__ void rope_multi(
														
 
															-    const T * x, T * dst, int ne0, int ne2, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
														
 
															-    float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors, mrope_sections sections) {
														
 
															+        const T * __restrict__ x, T * __restrict__ dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2,
														
 
															+        const int n_dims, const int32_t * __restrict__ pos, const float freq_scale, const float ext_factor, const float attn_factor,
														
 
															+        const rope_corr_dims corr_dims, const float theta_scale, const float * __restrict__ freq_factors, const mrope_sections sections) {
														
 
															     const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
														
 
															     if (i0 >= ne0) {
														
 
															         return;
														
 
															     }
														
 
															-    const int row = blockDim.x*blockIdx.x + threadIdx.x;
														
 
															+    const int row_dst = blockDim.x*blockIdx.x + threadIdx.x;
														
 
															     if (i0 >= n_dims) {
														
 
															-        const int i = row*ne0 + i0;
														
 
															-        dst[i + 0] = x[i + 0];
														
 
															-        dst[i + 1] = x[i + 1];
														
 
															+        const int i = row_dst*ne0 + i0; // dst is indexed densely; x must be read through s1 (row) / s2 (channel) strides, same as ix in the rotated path
														
 
															+        dst[i + 0] = x[(row_dst / ne1)*s2 + (row_dst % ne1)*s1 + i0 + 0]; // unrotated tail: plain copy from the strided source
														
 
															+        dst[i + 1] = x[(row_dst / ne1)*s2 + (row_dst % ne1)*s1 + i0 + 1];
														
@@ -134,25 +147,28 @@ static __global__ void rope_multi(
 
															         return;
														
 
															     }
														
 
															-    const int i  = row*ne0 + i0/2;
														
 
															-    const int i2 = row/p_delta_rows;
														
 
															+    const int row_x     = row_dst % ne1;
														
 
															+    const int channel_x = row_dst / ne1;
														
 
															-    int sect_dims = sections.v[0] + sections.v[1] + sections.v[2] + sections.v[3];
														
 
															-    int sec_w = sections.v[1] + sections.v[0];
														
 
															-    int sector = (i0 / 2) % sect_dims;
														
 
															+    const int idst = row_dst*ne0 + i0/2;
														
 
															+    const int ix   = channel_x*s2 + row_x*s1 + i0/2;
														
 
															+
														
 
															+    const int sect_dims = sections.v[0] + sections.v[1] + sections.v[2] + sections.v[3];
														
 
															+    const int sec_w = sections.v[1] + sections.v[0];
														
 
															+    const int sector = (i0 / 2) % sect_dims;
														
 
															     float theta_base = 0.0;
														
 
															     if (sector < sections.v[0]) {
														
 
															-        theta_base = pos[i2]*powf(theta_scale, i0/2.0f);
														
 
															+        theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f);
														
 
															     }
														
 
															     else if (sector >= sections.v[0] && sector < sec_w) {
														
 
															-        theta_base = pos[i2 + ne2 * 1]*powf(theta_scale, i0/2.0f);
														
 
															+        theta_base = pos[channel_x + ne2 * 1]*powf(theta_scale, i0/2.0f);
														
 
															     }
														
 
															     else if (sector >= sec_w && sector < sec_w + sections.v[2]) {
														
 
															-        theta_base = pos[i2 + ne2 * 2]*powf(theta_scale, i0/2.0f);
														
 
															+        theta_base = pos[channel_x + ne2 * 2]*powf(theta_scale, i0/2.0f);
														
 
															     }
														
 
															     else if (sector >= sec_w + sections.v[2]) {
														
 
															-        theta_base = pos[i2 + ne2 * 3]*powf(theta_scale, i0/2.0f);
														
 
															+        theta_base = pos[channel_x + ne2 * 3]*powf(theta_scale, i0/2.0f);
														
 
															     }
														
 
															     const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
														
@@ -160,42 +176,46 @@ static __global__ void rope_multi(
 
															     float cos_theta;
														
 
															     float sin_theta;
														
 
															-    rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
														
 
															+    rope_yarn<forward>(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, cos_theta, sin_theta);
														
 
															-    const float x0 = x[i + 0];
														
 
															-    const float x1 = x[i + n_dims/2];
														
 
															+    const float x0 = x[ix + 0];
														
 
															+    const float x1 = x[ix + n_dims/2];
														
 
															-    dst[i + 0]        = x0*cos_theta - x1*sin_theta;
														
 
															-    dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
														
 
															+    dst[idst + 0]        = x0*cos_theta - x1*sin_theta;
														
 
															+    dst[idst + n_dims/2] = x0*sin_theta + x1*cos_theta;
														
 
															 }
														
 
															-template<typename T, bool has_ff>
														
 
															+template<bool forward, bool has_ff, typename T>
														
 
															 static __global__ void rope_vision(
														
 
															-    const T * x, T * dst, int ne0, int ne2, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
														
 
															-    float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors, mrope_sections sections) {
														
 
															+        const T * __restrict__ x, T * __restrict__ dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2, const int n_dims,
														
 
															+        const int32_t * __restrict__ pos, const float freq_scale, const float ext_factor, const float attn_factor, const rope_corr_dims corr_dims,
														
 
															+        const float theta_scale, const float * __restrict__ freq_factors, const mrope_sections sections) {
														
 
															     const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
														
 
															     if (i0 >= ne0) {
														
 
															         return;
														
 
															     }
														
 
															-    const int row = blockDim.x*blockIdx.x + threadIdx.x;
														
 
															+    const int row_dst = blockDim.x*blockIdx.x + threadIdx.x;
														
 
															+
														
 
															+    const int row_x     = row_dst % ne1;
														
 
															+    const int channel_x = row_dst / ne1;
														
 
															-    const int i  = row*ne0 + i0/2;
														
 
															-    const int i2 = row/p_delta_rows; // i2-th tokens
														
 
															+    const int idst = row_dst*ne0 + i0/2;
														
 
															+    const int ix   = channel_x*s2 + row_x*s1 + i0/2;
														
 
															-    int sect_dims = sections.v[0] + sections.v[1];
														
 
															-    int sec_w = sections.v[1] + sections.v[0];
														
 
															-    int sector = (i0 / 2) % sect_dims;
														
 
															+    const int sect_dims = sections.v[0] + sections.v[1];
														
 
															+    const int sec_w = sections.v[1] + sections.v[0];
														
 
															+    const int sector = (i0 / 2) % sect_dims;
														
 
															     float theta_base = 0.0;
														
 
															     if (sector < sections.v[0]) {
														
 
															         const int p = sector;
														
 
															-        theta_base = pos[i2]*powf(theta_scale, p);
														
 
															+        theta_base = pos[channel_x]*powf(theta_scale, p);
														
 
															     }
														
 
															     else if (sector >= sections.v[0] && sector < sec_w) {
														
 
															         const int p = sector - sections.v[0];
														
 
															-        theta_base = pos[i2 + ne2]*powf(theta_scale, p);
														
 
															+        theta_base = pos[channel_x + ne2]*powf(theta_scale, p);
														
 
															     }
														
 
															     const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
														
@@ -203,19 +223,20 @@ static __global__ void rope_vision(
 
															     float cos_theta;
														
 
															     float sin_theta;
														
 
															-    rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
														
 
															+    rope_yarn<forward>(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, cos_theta, sin_theta);
														
 
															-    const float x0 = x[i + 0];
														
 
															-    const float x1 = x[i + n_dims];
														
 
															+    const float x0 = x[ix + 0];
														
 
															+    const float x1 = x[ix + n_dims];
														
 
															-    dst[i + 0]      = x0*cos_theta - x1*sin_theta;
														
 
															-    dst[i + n_dims] = x0*sin_theta + x1*cos_theta;
														
 
															+    dst[idst + 0]      = x0*cos_theta - x1*sin_theta;
														
 
															+    dst[idst + n_dims] = x0*sin_theta + x1*cos_theta;
														
 
															 }
														
 
															-template<typename T>
														
 
															+template<bool forward, typename T>
														
 
															 static void rope_norm_cuda(
														
 
															-    const T * x, T * dst, int ne0, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
														
 
															-    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) {
														
 
															+        const T * __restrict__ x, T * __restrict__ dst, const int ne0, const int ne1, const int s1, const int s2, const int n_dims, const int nr,
														
 
															+        const int32_t * __restrict__ pos, const float freq_scale, const float freq_base, const float ext_factor, const float attn_factor,
														
 
															+        const rope_corr_dims corr_dims, const float * __restrict__ freq_factors, cudaStream_t stream) {
														
 
															     GGML_ASSERT(ne0 % 2 == 0);
														
 
															     const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
														
 
															     const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
														
@@ -224,22 +245,21 @@ static void rope_norm_cuda(
 
															     const float theta_scale = powf(freq_base, -2.0f/n_dims);
														
 
															     if (freq_factors == nullptr) {
														
 
															-        rope_norm<T, false><<<block_nums, block_dims, 0, stream>>>(
														
 
															-                x, dst, ne0, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
														
 
															-                theta_scale, freq_factors
														
 
															-                );
														
 
															+        rope_norm<forward, false, T><<<block_nums, block_dims, 0, stream>>>( // T spelled out, as in the rope_neox/rope_multi launches
														
 
															+            x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor,
														
 
															+            attn_factor, corr_dims, theta_scale, freq_factors);
														
 
															     } else {
														
 
															-        rope_norm<T, true><<<block_nums, block_dims, 0, stream>>>(
														
 
															-                x, dst, ne0, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
														
 
															-                theta_scale, freq_factors
														
 
															-                );
														
 
															+        rope_norm<forward, true, T><<<block_nums, block_dims, 0, stream>>>( // T spelled out, as in the rope_neox/rope_multi launches
														
 
															+            x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor,
														
 
															+            attn_factor, corr_dims, theta_scale, freq_factors);
														
 
															     }
														
 
															 }
														
 
															-template<typename T>
														
 
															+template<bool forward, typename T>
														
 
															 static void rope_neox_cuda(
														
 
															-    const T * x, T * dst, int ne0, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
														
 
															-    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) {
														
 
															+        const T * __restrict__ x, T * __restrict__ dst, const int ne0, const int ne1, const int s1, const int s2, const int n_dims, const int nr,
														
 
															+        const int32_t * __restrict__ pos, const float freq_scale, const float freq_base, const float ext_factor, const float attn_factor,
														
 
															+        const rope_corr_dims corr_dims, const float * __restrict__ freq_factors, cudaStream_t stream) {
														
 
															     GGML_ASSERT(ne0 % 2 == 0);
														
 
															     const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
														
 
															     const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
														
@@ -248,22 +268,21 @@ static void rope_neox_cuda(
 
															     const float theta_scale = powf(freq_base, -2.0f/n_dims);
														
 
															     if (freq_factors == nullptr) {
														
 
															-        rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(
														
 
															-                x, dst, ne0, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
														
 
															-                theta_scale, freq_factors
														
 
															-                );
														
 
															+        rope_neox<forward, false, T><<<block_nums, block_dims, 0, stream>>>(
														
 
															+            x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor,
														
 
															+            attn_factor, corr_dims, theta_scale, freq_factors);
														
 
															     } else {
														
 
															-        rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(
														
 
															-                x, dst, ne0, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
														
 
															-                theta_scale, freq_factors
														
 
															-                );
														
 
															+        rope_neox<forward, true, T><<<block_nums, block_dims, 0, stream>>>(
														
 
															+            x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor,
														
 
															+            attn_factor, corr_dims, theta_scale, freq_factors);
														
 
															     }
														
 
															 }
														
 
															-template<typename T>
														
 
															+template<bool forward, typename T>
														
 
															 static void rope_multi_cuda(
														
 
															-    const T * x, T * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
														
 
															-    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream) {
														
 
															+        const T * __restrict__ x, T * __restrict__ dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2, const int n_dims, const int nr,
														
 
															+        const int32_t * __restrict__ pos, const float freq_scale, const float freq_base, const float ext_factor, const float attn_factor,
														
 
															+        const rope_corr_dims corr_dims, const float * __restrict__ freq_factors, const mrope_sections sections, cudaStream_t stream) {
														
 
															     GGML_ASSERT(ne0 % 2 == 0);
														
 
															     const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
														
 
															     const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
														
@@ -272,22 +291,21 @@ static void rope_multi_cuda(
 
															     const float theta_scale = powf(freq_base, -2.0f/n_dims);
														
 
															     if (freq_factors == nullptr) {
														
 
															-        rope_multi<T, false><<<block_nums, block_dims, 0, stream>>>(
														
 
															-                x, dst, ne0, ne2, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
														
 
															-                theta_scale, freq_factors, sections
														
 
															-                );
														
 
															+        rope_multi<forward, false, T><<<block_nums, block_dims, 0, stream>>>(
														
 
															+            x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor,
														
 
															+            attn_factor, corr_dims, theta_scale, freq_factors, sections);
														
 
															     } else {
														
 
															-        rope_multi<T, true><<<block_nums, block_dims, 0, stream>>>(
														
 
															-                x, dst, ne0, ne2, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
														
 
															-                theta_scale, freq_factors, sections
														
 
															-                );
														
 
															+        rope_multi<forward, true, T><<<block_nums, block_dims, 0, stream>>>(
														
 
															+            x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor,
														
 
															+            attn_factor, corr_dims, theta_scale, freq_factors, sections);
														
 
															     }
														
 
															 }
														
 
															-template<typename T>
														
 
															+template<bool forward, typename T>
														
 
															 static void rope_vision_cuda(
														
 
															-    const T * x, T * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
														
 
															-    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream) {
														
 
															+        const T * __restrict__ x, T * __restrict__ dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2, const int n_dims, const int nr,
														
 
															+        const int32_t * __restrict__ pos, const float freq_scale, const float freq_base, const float ext_factor, const float attn_factor,
														
 
															+        const rope_corr_dims corr_dims, const float * __restrict__ freq_factors, const mrope_sections sections, cudaStream_t stream) {
														
 
															     GGML_ASSERT(ne0 % 2 == 0);
														
 
															     const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
														
 
															     const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
														
@@ -298,80 +316,18 @@ static void rope_vision_cuda(
 
															     const float theta_scale = powf(freq_base, -2.0f/n_dims);
														
 
															     if (freq_factors == nullptr) {
														
 
															-        rope_vision<T, false><<<block_nums, block_dims, 0, stream>>>(
														
 
															-                x, dst, ne0, ne2, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
														
 
															-                theta_scale, freq_factors, sections
														
 
															-                );
														
 
															+        rope_vision<forward, false, T><<<block_nums, block_dims, 0, stream>>>(
														
 
															+            x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor,
														
 
															+            attn_factor, corr_dims, theta_scale, freq_factors, sections);
														
 
															     } else {
														
 
															-        rope_vision<T, true><<<block_nums, block_dims, 0, stream>>>(
														
 
															-                x, dst, ne0, ne2, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
														
 
															-                theta_scale, freq_factors, sections
														
 
															-                );
														
 
															+        rope_vision<forward, true, T><<<block_nums, block_dims, 0, stream>>>(
														
 
															+            x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor,
														
 
															+            attn_factor, corr_dims, theta_scale, freq_factors, sections);
														
 
															     }
														
 
															 }
														
 
															-static void rope_norm_cuda_f16(
														
 
															-    const half * x, half * dst, int ne0, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
														
 
															-    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) {
														
 
															-
														
 
															-    rope_norm_cuda<half>(x, dst, ne0, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream);
														
 
															-}
														
 
															-
														
 
															-static void rope_norm_cuda_f32(
														
 
															-    const float * x, float * dst, int ne0, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
														
 
															-    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) {
														
 
															-
														
 
															-    rope_norm_cuda<float>(x, dst, ne0, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream);
														
 
															-}
														
 
															-
														
 
															-static void rope_neox_cuda_f16(
														
 
															-    const half * x, half * dst, int ne0, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
														
 
															-    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) {
														
 
															-
														
 
															-    rope_neox_cuda<half>(x, dst, ne0, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream);
														
 
															-}
														
 
															-
														
 
															-static void rope_neox_cuda_f32(
														
 
															-    const float * x, float * dst, int ne0, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
														
 
															-    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream
														
 
															-) {
														
 
															-
														
 
															-    rope_neox_cuda<float>(x, dst, ne0, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream);
														
 
															-}
														
 
															-
														
 
															-static void rope_multi_cuda_f16(
														
 
															-    const half * x, half * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
														
 
															-    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream
														
 
															-) {
														
 
															-
														
 
															-    rope_multi_cuda<half>(x, dst, ne0, ne2, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
														
 
															-}
														
 
															-
														
 
															-static void rope_multi_cuda_f32(
														
 
															-    const float * x, float * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
														
 
															-    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream
														
 
															-) {
														
 
															-
														
 
															-    rope_multi_cuda<float>(x, dst, ne0, ne2, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
														
 
															-}
														
 
															-
														
 
															-static void rope_vision_cuda_f16(
														
 
															-    const half * x, half * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
														
 
															-    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream
														
 
															-) {
														
 
															-
														
 
															-    rope_vision_cuda<half>(x, dst, ne0, ne2, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
														
 
															-}
														
 
															-
														
 
															-static void rope_vision_cuda_f32(
														
 
															-    const float * x, float * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
														
 
															-    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream
														
 
															-) {
														
 
															-
														
 
															-    rope_vision_cuda<float>(x, dst, ne0, ne2, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
														
 
															-}
														
 
															-
														
 
															-void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
														
 
															+template <bool forward>
														
 
															+void ggml_cuda_op_rope_impl(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
														
 
															     const ggml_tensor * src0 = dst->src[0];
														
 
															     const ggml_tensor * src1 = dst->src[1];
														
 
															     const ggml_tensor * src2 = dst->src[2];
														
@@ -382,7 +338,6 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
 
															     float * dst_d = (float *)dst->data;
														
 
															     cudaStream_t stream = ctx.stream();
														
 
															-    GGML_ASSERT(ggml_is_contiguous(src0));
														
 
															     GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
														
 
															     GGML_ASSERT( dst->type == GGML_TYPE_F32 ||  dst->type == GGML_TYPE_F16);
														
 
															     GGML_ASSERT(src0->type == dst->type);
														
@@ -392,6 +347,9 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
 
															     const int64_t ne02 = src0->ne[2]; // num heads
														
 
															     const int64_t nr = ggml_nrows(src0);
														
 
															+    const size_t s01 = src0->nb[1] / ggml_type_size(src0->type);
														
 
															+    const size_t s02 = src0->nb[2] / ggml_type_size(src0->type);
														
 
															+
														
 
															     //const int n_past     = ((int32_t *) dst->op_params)[0];
														
 
															     const int n_dims     = ((int32_t *) dst->op_params)[1];
														
 
															     const int mode       = ((int32_t *) dst->op_params)[2];
														
@@ -440,59 +398,59 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
 
															     // compute
														
 
															     if (is_neox) {
														
 
															         if (src0->type == GGML_TYPE_F32) {
														
 
															-            rope_neox_cuda_f32(
														
 
															-                (const float *)src0_d, (float *)dst_d, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
														
 
															-                attn_factor, corr_dims, freq_factors, stream
														
 
															-            );
														
 
															+            rope_neox_cuda<forward>(
														
 
															+                (const float *) src0_d, (float *) dst_d, ne00, ne01, s01, s02, n_dims, nr, pos, freq_scale,
														
 
															+                freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream);
														
 
															         } else if (src0->type == GGML_TYPE_F16) {
														
 
															-            rope_neox_cuda_f16(
														
 
															-                (const half *)src0_d, (half *)dst_d, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
														
 
															-                attn_factor, corr_dims, freq_factors, stream
														
 
															-            );
														
 
															+            rope_neox_cuda<forward>(
														
 
															+                (const half *) src0_d, (half *) dst_d, ne00, ne01, s01, s02, n_dims, nr, pos, freq_scale,
														
 
															+                freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream);
														
 
															         } else {
														
 
															             GGML_ABORT("fatal error");
														
 
															         }
														
 
															     } else if (is_mrope && !is_vision) {
														
 
															         if (src0->type == GGML_TYPE_F32) {
														
 
															-            rope_multi_cuda_f32(
														
 
															-                (const float *)src0_d, (float *)dst_d, ne00, ne02, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
														
 
															-                attn_factor, corr_dims, freq_factors, sections, stream
														
 
															-            );
														
 
															+            rope_multi_cuda<forward>(
														
 
															+                (const float *) src0_d, (float *) dst_d, ne00, ne01, ne02, s01, s02, n_dims, nr, pos, freq_scale,
														
 
															+                freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
														
 
															         } else if (src0->type == GGML_TYPE_F16) {
														
 
															-            rope_multi_cuda_f16(
														
 
															-                (const half *)src0_d, (half *)dst_d, ne00, ne02, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
														
 
															-                attn_factor, corr_dims, freq_factors, sections, stream
														
 
															-            );
														
 
															+            rope_multi_cuda<forward>(
														
 
															+                (const half *) src0_d, (half *) dst_d, ne00, ne01, ne02, s01, s02, n_dims, nr, pos, freq_scale,
														
 
															+                freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
														
 
															         } else {
														
 
															             GGML_ABORT("fatal error");
														
 
															         }
														
 
															     } else if (is_vision) {
														
 
															         if (src0->type == GGML_TYPE_F32) {
														
 
															-            rope_vision_cuda_f32(
														
 
															-                (const float *)src0_d, (float *)dst_d, ne00, ne02, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
														
 
															-                attn_factor, corr_dims, freq_factors, sections, stream
														
 
															-            );
														
 
															+            rope_vision_cuda<forward>(
														
 
															+                (const float *) src0_d, (float *) dst_d, ne00, ne01, ne02, s01, s02, n_dims, nr, pos, freq_scale,
														
 
															+                freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
														
 
															         } else if (src0->type == GGML_TYPE_F16) {
														
 
															-            rope_vision_cuda_f16(
														
 
															-                (const half *)src0_d, (half *)dst_d, ne00, ne02, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
														
 
															-                attn_factor, corr_dims, freq_factors, sections, stream
														
 
															-            );
														
 
															+            rope_vision_cuda<forward>(
														
 
															+                (const half *) src0_d, (half *) dst_d, ne00, ne01, ne02, s01, s02, n_dims, nr, pos, freq_scale,
														
 
															+                freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
														
 
															         } else {
														
 
															             GGML_ABORT("fatal error");
														
 
															         }
														
 
															     } else {
														
 
															         if (src0->type == GGML_TYPE_F32) {
														
 
															-            rope_norm_cuda_f32(
														
 
															-                (const float *)src0_d, (float *)dst_d, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
														
 
															-                attn_factor, corr_dims, freq_factors, stream
														
 
															-            );
														
 
															+            rope_norm_cuda<forward>(
														
 
															+                (const float *) src0_d, (float *) dst_d, ne00, ne01, s01, s02, n_dims, nr, pos, freq_scale,
														
 
															+                freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream);
														
 
															         } else if (src0->type == GGML_TYPE_F16) {
														
 
															-            rope_norm_cuda_f16(
														
 
															-                (const half *)src0_d, (half *)dst_d, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
														
 
															-                attn_factor, corr_dims, freq_factors, stream
														
 
															-            );
														
 
															+            rope_norm_cuda<forward>(
														
 
															+                (const half *) src0_d, (half *) dst_d, ne00, ne01, s01, s02, n_dims, nr, pos, freq_scale,
														
 
															+                freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream);
														
 
															         } else {
														
 
															             GGML_ABORT("fatal error");
														
 
															         }
														
 
															     }
														
 
															 }
														
 
															+
														
 
															+void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
														
 
															+    ggml_cuda_op_rope_impl<true>(ctx, dst);
														
 
															+}
														
 
															+
														
 
															+void ggml_cuda_op_rope_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
														
 
															+    ggml_cuda_op_rope_impl<false>(ctx, dst);
														
 
															+}
														
--- a/ggml/src/ggml-cuda/rope.cuh
+++ b/ggml/src/ggml-cuda/rope.cuh
@@ -3,3 +3,5 @@
 
															 #define CUDA_ROPE_BLOCK_SIZE 256
														
 
															 void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
														
 
															+
														
 
															+void ggml_cuda_op_rope_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
														
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -3695,7 +3695,7 @@ void ggml_rope_yarn_corr_dims(
 
															 // ggml_rope_back
														
 
															-struct ggml_tensor * ggml_rope_back(
														
 
															+struct ggml_tensor * ggml_rope_ext_back(
														
 
															         struct ggml_context * ctx,
														
 
															         struct ggml_tensor  * a,
														
 
															         struct ggml_tensor  * b,
														
@@ -3709,29 +3709,32 @@ struct ggml_tensor * ggml_rope_back(
 
															         float                 attn_factor,
														
 
															         float                 beta_fast,
														
 
															         float                 beta_slow) {
														
 
															-    GGML_ASSERT(ggml_is_vector(b));
														
 
															-    GGML_ASSERT(b->type == GGML_TYPE_I32);
														
 
															-    GGML_ASSERT(a->ne[2] == b->ne[0]);
														
 
															-
														
 
															-    struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
														
 
															-
														
 
															-    int32_t params[11] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
														
 
															-    memcpy(params +  5, &freq_base,    sizeof(float));
														
 
															-    memcpy(params +  6, &freq_scale,   sizeof(float));
														
 
															-    memcpy(params +  7, &ext_factor,   sizeof(float));
														
 
															-    memcpy(params +  8, &attn_factor,  sizeof(float));
														
 
															-    memcpy(params +  9, &beta_fast,    sizeof(float));
														
 
															-    memcpy(params + 10, &beta_slow,    sizeof(float));
														
 
															-    ggml_set_op_params(result, params, sizeof(params));
														
 
															-
														
 
															-    result->op     = GGML_OP_ROPE_BACK;
														
 
															-    result->src[0] = a;
														
 
															-    result->src[1] = b;
														
 
															-    result->src[2] = c;
														
 
															-
														
 
															+    struct ggml_tensor * result = ggml_rope_ext(
														
 
															+        ctx, a, b, c, n_dims, mode, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
														
 
															+    result->op = GGML_OP_ROPE_BACK;
														
 
															     return result;
														
 
															 }
														
 
															+struct ggml_tensor * ggml_rope_multi_back(
														
 
															+        struct ggml_context * ctx,
														
 
															+        struct ggml_tensor  * a,
														
 
															+        struct ggml_tensor  * b,
														
 
															+        struct ggml_tensor  * c,
														
 
															+        int                   n_dims,
														
 
															+        int                   sections[4],
														
 
															+        int                   mode,
														
 
															+        int                   n_ctx_orig,
														
 
															+        float                 freq_base,
														
 
															+        float                 freq_scale,
														
 
															+        float                 ext_factor,
														
 
															+        float                 attn_factor,
														
 
															+        float                 beta_fast,
														
 
															+        float                 beta_slow) {
														
 
															+    struct ggml_tensor * result = ggml_rope_multi(
														
 
															+        ctx, a, b, c, n_dims, sections, mode, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
														
 
															+    result->op = GGML_OP_ROPE_BACK;
														
 
															+    return result;
														
 
															+}
														
 
															 // ggml_clamp
														
 
															 struct ggml_tensor * ggml_clamp(
														
@@ -5594,6 +5597,7 @@ static void ggml_compute_backward(
 
															                 //const int n_ctx      = ((int32_t *) tensor->op_params)[3];
														
 
															                 const int n_ctx_orig = ((const int32_t *) tensor->op_params)[4];
														
 
															                 float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
														
 
															+                int sections[4] = {0, 0, 0, 0};
														
 
															                 memcpy(&freq_base,   (const float *) tensor->op_params +  5, sizeof(float));
														
 
															                 memcpy(&freq_scale,  (const float *) tensor->op_params +  6, sizeof(float));
														
@@ -5601,10 +5605,14 @@ static void ggml_compute_backward(
 
															                 memcpy(&attn_factor, (const float *) tensor->op_params +  8, sizeof(float));
														
 
															                 memcpy(&beta_fast,   (const float *) tensor->op_params +  9, sizeof(float));
														
 
															                 memcpy(&beta_slow,   (const float *) tensor->op_params + 10, sizeof(float));
														
 
															-
														
 
															-                ggml_add_or_set(ctx, cgraph, isrc0,
														
 
															-                    ggml_rope_back(ctx, grad, src1, src2, n_dims, mode, n_ctx_orig, freq_base,
														
 
															-                        freq_scale, ext_factor, attn_factor, beta_fast, beta_slow));
														
 
															+                memcpy(&sections,                    tensor->op_params + 11, sizeof(sections));
														
 
															+
														
 
															+                struct ggml_tensor * rope_back = grad->ne[2] == src1->ne[0] ?
														
 
															+                    ggml_rope_ext_back(ctx, grad, src1, src2, n_dims,
														
 
															+                        mode, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow) :
														
 
															+                    ggml_rope_multi_back(ctx, grad, src1, src2, n_dims, sections,
														
 
															+                        mode, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
														
 
															+                ggml_add_or_set(ctx, cgraph, isrc0, rope_back);
														
 
															             }
														
 
															             GGML_ASSERT((!src2 || !src2_needs_grads) && "gradients for freq factors not implemented");
														
 
															         } break;
														
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -4642,7 +4642,7 @@ struct llm_build_context {
 
															                     0);
														
 
															                 cb(v_states, "v_states", il);
														
 
															-                q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
														
 
															+                q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
														
 
															                 q_pe = ggml_rope_ext(
														
 
															                     ctx0, q_pe, inp_pos, rope_factors,
														
 
															                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
														
@@ -4651,7 +4651,7 @@ struct llm_build_context {
 
															                 cb(q_pe, "q_pe", il);
														
 
															                 // shared RoPE key
														
 
															-                k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
														
 
															+                k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
														
 
															                 k_pe = ggml_rope_ext(
														
 
															                     ctx0, k_pe, inp_pos, rope_factors,
														
 
															                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
														
@@ -6496,7 +6496,7 @@ struct llm_build_context {
 
															                     0);
														
 
															                 cb(v_states, "v_states", il);
														
 
															-                q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
														
 
															+                q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
														
 
															                 q_pe = ggml_rope_ext(
														
 
															                     ctx0, q_pe, inp_pos, nullptr,
														
 
															                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
														
@@ -6505,7 +6505,7 @@ struct llm_build_context {
 
															                 cb(q_pe, "q_pe", il);
														
 
															                 // shared RoPE key
														
 
															-                k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
														
 
															+                k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
														
 
															                 k_pe = ggml_rope_ext(
														
 
															                     ctx0, k_pe, inp_pos, nullptr,
														
 
															                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
														
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -2192,7 +2192,7 @@ struct test_soft_max : public test_case {
 
															 };
														
 
															-// GGML_OP_ROPE
														
 
															+// GGML_OP_ROPE + GGML_OP_ROPE_BACK
														
 
															 struct test_rope : public test_case {
														
 
															     const ggml_type type;
														
 
															     const std::array<int64_t, 4> ne_a;
														
@@ -2204,29 +2204,36 @@ struct test_rope : public test_case {
 
															     float af; // attn_factor
														
 
															     bool ff;
														
 
															     int v; // view (1 : non-contiguous a)
														
 
															+    bool forward;
														
 
															     std::string vars() override {
														
 
															+        // forward can be inferred from the op, does not need to be printed
														
 
															         return VARS_TO_STR10(type, ne_a, n_dims, mode, n_ctx, fs, ef, af, ff, v);
														
 
															     }
														
 
															     test_rope(ggml_type type = GGML_TYPE_F32,
														
 
															             std::array<int64_t, 4> ne_a = {10, 5, 3, 1},
														
 
															-            int n_dims = 10, int mode = 0, int n_ctx = 512, float fs = 1.0f, float ef = 0.0f, float af = 0.0f, bool ff = false, int v = 0)
														
 
															-        : type(type), ne_a(ne_a), n_dims(n_dims), mode(mode), n_ctx(n_ctx), fs(fs), ef(ef), af(af), ff(ff), v(v) {}
														
 
															+            int n_dims = 10, int mode = 0, int n_ctx = 512, float fs = 1.0f,
														
 
															+            float ef = 0.0f, float af = 0.0f, bool ff = false, int v = 0, bool forward = true)
														
 
															+        : type(type), ne_a(ne_a), n_dims(n_dims), mode(mode), n_ctx(n_ctx), fs(fs), ef(ef), af(af), ff(ff), v(v), forward(forward) {}
														
 
															     ggml_tensor * build_graph(ggml_context * ctx) override {
														
 
															         ggml_tensor * a;
														
 
															         if (v & 1) {
														
 
															             auto ne = ne_a; ne[0] *= 2; ne[1] *= 4; ne[2] *= 3;
														
 
															             a = ggml_new_tensor(ctx, type, 4, ne.data());
														
 
															-            ggml_set_param(ctx, a);
														
 
															+            if (forward) {
														
 
															+                ggml_set_param(ctx, a);
														
 
															+            }
														
 
															             ggml_set_name(a, "a");
														
 
															             a = ggml_view_4d(ctx, a, ne_a[0], ne_a[1], ne_a[2], ne_a[3], a->nb[1], a->nb[2], a->nb[3], 0);
														
 
															             ggml_set_name(a, "view_of_a");
														
 
															         } else {
														
 
															             a = ggml_new_tensor(ctx, type, 4, ne_a.data());
														
 
															-            ggml_set_param(ctx, a);
														
 
															+            if (forward) {
														
 
															+                ggml_set_param(ctx, a);
														
 
															+            }
														
 
															             ggml_set_name(a, "a");
														
 
															         }
														
@@ -2252,14 +2259,26 @@ struct test_rope : public test_case {
 
															             if (is_vision) {
														
 
															                 GGML_ASSERT(n_dims/4 > 0);
														
 
															                 int rope_sections[4] = {n_dims/4, n_dims/4, 0, 0}; // Vision-RoPE only use first two dimension for image (x, y) coordinate
														
 
															-                out = ggml_rope_multi(ctx, a, pos, freq, n_dims/2, rope_sections, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
														
 
															+                if (forward) {
														
 
															+                    out = ggml_rope_multi     (ctx, a, pos, freq, n_dims/2, rope_sections, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
														
 
															+                } else {
														
 
															+                    out = ggml_rope_multi_back(ctx, a, pos, freq, n_dims/2, rope_sections, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
														
 
															+                }
														
 
															             } else {
														
 
															                 GGML_ASSERT(n_dims/3 > 0);
														
 
															                 int rope_sections[4] = {n_dims/3, n_dims/3, n_dims/3, 0};
														
 
															-                out = ggml_rope_multi(ctx, a, pos, freq, n_dims, rope_sections, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
														
 
															+                if (forward) {
														
 
															+                    out = ggml_rope_multi     (ctx, a, pos, freq, n_dims, rope_sections, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
														
 
															+                } else {
														
 
															+                    out = ggml_rope_multi_back(ctx, a, pos, freq, n_dims, rope_sections, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
														
 
															+                }
														
 
															             }
														
 
															         } else {
														
 
															-            out = ggml_rope_ext(ctx, a, pos, freq, n_dims, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
														
 
															+            if (forward) {
														
 
															+                out = ggml_rope_ext     (ctx, a, pos, freq, n_dims, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
														
 
															+            } else {
														
 
															+                out = ggml_rope_ext_back(ctx, a, pos, freq, n_dims, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
														
 
															+            }
														
 
															         }
														
 
															         ggml_set_name(out, "out");
														
@@ -3844,7 +3863,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
 
															     test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true,  0.1f, 0.0f));
														
 
															     test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true,  0.1f, 8.0f));
														
 
															-    {
														
 
															+    for (bool fw : {true, false}) { // fw == forward
														
 
															         bool all = true;
														
 
															         for (float v : { 0, 1 }) {
														
@@ -3853,29 +3872,29 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
 
															                     for (float af : { 1.0f, 1.4245f }) {
														
 
															                         for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
														
 
															                             for (bool ff : {false, true}) { // freq_factors
														
 
															-                                test_cases.emplace_back(new test_rope(type, {128,  32, 2, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 7B
														
 
															+                                test_cases.emplace_back(new test_rope(type, {128,  32, 2, 1}, 128, 0, 512, fs, ef, af, ff, v, fw)); // llama 7B
														
 
															                                 if (all) {
														
 
															-                                    test_cases.emplace_back(new test_rope(type, {128,  40, 2, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 13B
														
 
															-                                    test_cases.emplace_back(new test_rope(type, {128,  52, 2, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 30B
														
 
															-                                    test_cases.emplace_back(new test_rope(type, {128,  64, 2, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 65B
														
 
															+                                    test_cases.emplace_back(new test_rope(type, {128,  40, 2, 1}, 128, 0, 512, fs, ef, af, ff, v, fw)); // llama 13B
														
 
															+                                    test_cases.emplace_back(new test_rope(type, {128,  52, 2, 1}, 128, 0, 512, fs, ef, af, ff, v, fw)); // llama 30B
														
 
															+                                    test_cases.emplace_back(new test_rope(type, {128,  64, 2, 1}, 128, 0, 512, fs, ef, af, ff, v, fw)); // llama 65B
														
 
															                                 }
														
 
															                                 if (all) {
														
 
															-                                    test_cases.emplace_back(new test_rope(type, { 64,   1, 2, 1},  64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 7B)
														
 
															-                                    test_cases.emplace_back(new test_rope(type, { 64,  71, 2, 1},  64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 7B)
														
 
															-                                    test_cases.emplace_back(new test_rope(type, { 64,   8, 2, 1},  64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 40B)
														
 
															-                                    test_cases.emplace_back(new test_rope(type, { 80,  32, 2, 1},  20, 2, 512, fs, ef, af, ff, v)); // neox (stablelm)
														
 
															-                                    test_cases.emplace_back(new test_rope(type, { 80,  32, 2, 1},  32, 2, 512, fs, ef, af, ff, v)); // neox (phi-2)
														
 
															+                                    test_cases.emplace_back(new test_rope(type, { 64,   1, 2, 1},  64, 2, 512, fs, ef, af, ff, v, fw)); // neox (falcon 7B)
														
 
															+                                    test_cases.emplace_back(new test_rope(type, { 64,  71, 2, 1},  64, 2, 512, fs, ef, af, ff, v, fw)); // neox (falcon 7B)
														
 
															+                                    test_cases.emplace_back(new test_rope(type, { 64,   8, 2, 1},  64, 2, 512, fs, ef, af, ff, v, fw)); // neox (falcon 40B)
														
 
															+                                    test_cases.emplace_back(new test_rope(type, { 80,  32, 2, 1},  20, 2, 512, fs, ef, af, ff, v, fw)); // neox (stablelm)
														
 
															+                                    test_cases.emplace_back(new test_rope(type, { 80,  32, 2, 1},  32, 2, 512, fs, ef, af, ff, v, fw)); // neox (phi-2)
														
 
															                                 }
														
 
															                                 if (all) {
														
 
															-                                    test_cases.emplace_back(new test_rope(type, {128,  12, 2, 1}, 128, GGML_ROPE_TYPE_MROPE,  512, fs, ef, af, ff, v)); // rope_multi,m-rope (qwen2vl 2B)
														
 
															-                                    test_cases.emplace_back(new test_rope(type, {128,  28, 2, 1}, 128, GGML_ROPE_TYPE_MROPE,  512, fs, ef, af, ff, v)); // rope_multi,m-rope (qwen2vl 7B)
														
 
															-                                    test_cases.emplace_back(new test_rope(type, { 80,  16, 2, 1},  80, GGML_ROPE_TYPE_VISION, 512, fs, ef, af, ff, v)); // rope_multi,m-rope (qwen2vl ViT)
														
 
															+                                    test_cases.emplace_back(new test_rope(type, {128,  12, 2, 1}, 128, GGML_ROPE_TYPE_MROPE,  512, fs, ef, af, ff, v, fw)); // rope_multi,m-rope (qwen2vl 2B)
														
 
															+                                    test_cases.emplace_back(new test_rope(type, {128,  28, 2, 1}, 128, GGML_ROPE_TYPE_MROPE,  512, fs, ef, af, ff, v, fw)); // rope_multi,m-rope (qwen2vl 7B)
														
 
															+                                    test_cases.emplace_back(new test_rope(type, { 80,  16, 2, 1},  80, GGML_ROPE_TYPE_VISION, 512, fs, ef, af, ff, v, fw)); // rope_multi,m-rope (qwen2vl ViT)
														
 
															                                 }
														
 
															-                                test_cases.emplace_back(new test_rope(type, { 64, 128, 2, 1},  64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 40B)
														
 
															+                                test_cases.emplace_back(new test_rope(type, { 64, 128, 2, 1},  64, 2, 512, fs, ef, af, ff, v, fw)); // neox (falcon 40B)
														
 
															                             }
														
 
															                         }