hace 2 meses · 73460f6278
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -5503,194 +5503,28 @@ static void ggml_mrope_cache_init(
 
															     }
														
 
															 }
														
 
															-static void ggml_compute_forward_rope_f32(
														
 
															-        const ggml_compute_params * params,
														
 
															-        ggml_tensor * dst,
														
 
															-        const bool forward) {
														
 
															-
														
 
															-    const ggml_tensor * src0 = dst->src[0];
														
 
															-    const ggml_tensor * src1 = dst->src[1];
														
 
															-    const ggml_tensor * src2 = dst->src[2];
														
 
															-
														
 
															-    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
														
 
															-    int sections[4];
														
 
															-
														
 
															-    //const int n_past     = ((int32_t *) dst->op_params)[0];
														
 
															-    const int n_dims     = ((int32_t *) dst->op_params)[1];
														
 
															-    const int mode       = ((int32_t *) dst->op_params)[2];
														
 
															-    //const int n_ctx      = ((int32_t *) dst->op_params)[3];
														
 
															-    const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
														
 
															-
														
 
															-    memcpy(&freq_base,   (int32_t *) dst->op_params +  5, sizeof(float));
														
 
															-    memcpy(&freq_scale,  (int32_t *) dst->op_params +  6, sizeof(float));
														
 
															-    memcpy(&ext_factor,  (int32_t *) dst->op_params +  7, sizeof(float));
														
 
															-    memcpy(&attn_factor, (int32_t *) dst->op_params +  8, sizeof(float));
														
 
															-    memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
														
 
															-    memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
														
 
															-    memcpy(&sections,    (int32_t *) dst->op_params + 11, sizeof(int)*4);
														
 
															-
														
 
															-    GGML_TENSOR_UNARY_OP_LOCALS
														
 
															-
														
 
															-    //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
														
 
															-    //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
														
 
															-
														
 
															-    GGML_ASSERT(nb00 == sizeof(float));
														
 
															-
														
 
															-    const int ith = params->ith;
														
 
															-    const int nth = params->nth;
														
 
															-
														
 
															-    const int nr = ggml_nrows(dst);
														
 
															-
														
 
															-    GGML_ASSERT(n_dims <= ne0);
														
 
															-    GGML_ASSERT(n_dims % 2 == 0);
														
 
															-
														
 
															-    // rows per thread
														
 
															-    const int dr = (nr + nth - 1)/nth;
														
 
															-    // row range for this thread
														
 
															-    const int ir0 = dr*ith;
														
 
															-    const int ir1 = MIN(ir0 + dr, nr);
														
 
															-
														
 
															-    // row index used to determine which thread to use
														
 
															-    int ir = 0;
														
 
															-
														
 
															-    const float theta_scale = powf(freq_base, -2.0f/n_dims);
														
 
															-
														
 
															-    float corr_dims[2];
														
 
															-    ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
														
 
															-
														
 
															-    const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
														
 
															-    const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;  // ggml_rope_multi, multimodal rotary position embedding
														
 
															-    const bool is_imrope = mode == GGML_ROPE_TYPE_IMROPE; // qwen3vl apply interleaved mrope
														
 
															-    const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
														
 
															-
														
 
															-    if (is_mrope) {
														
 
															-        GGML_ASSERT(sections[0] > 0 || sections[1] > 0 || sections[2] > 0);
														
 
															-    }
														
 
															-
														
 
															-    if (is_vision) {
														
 
															-        GGML_ASSERT(n_dims == ne0/2);
														
 
															-    }
														
 
															+template<typename T>
														
 
															+static void rotate_pairs(const int64_t n, const int64_t n_offset, const float * cache, const T * src_data, T * dst_data, const int scale = 2) {
														
 
															+  for (int64_t i0 = 0; i0 < n; i0 += 2) {
														
 
															+    const int64_t ic = i0/scale; // hack for GGML_ROPE_TYPE_NORMAL, where we need ic = i0; for all other cases, ic = i0/2
														
 
															-    const float * freq_factors = NULL;
														
 
															-    if (src2 != NULL) {
														
 
															-        GGML_ASSERT(src2->type == GGML_TYPE_F32);
														
 
															-        GGML_ASSERT(src2->ne[0] >= n_dims / 2);
														
 
															-        freq_factors = (const float *) src2->data;
														
 
															-    }
														
 
															+    const float cos_theta = cache[i0 + 0];
														
 
															+    const float sin_theta = cache[i0 + 1];
														
 
															-    // backward process uses inverse rotation by cos and sin.
														
 
															-    // cos and sin build a rotation matrix, where the inverse is the transpose.
														
 
															-    // this essentially just switches the sign of sin.
														
 
															-    const float sin_sign = forward ? 1.0f : -1.0f;
														
 
															+    const T * const src = src_data + ic;
														
 
															+    T * dst             = dst_data + ic;
														
 
															-    const int32_t * pos = (const int32_t *) src1->data;
														
 
															+    const float x0 = type_conversion_table<T>::to_f32(src[0]);
														
 
															+    const float x1 = type_conversion_table<T>::to_f32(src[n_offset]);
														
 
															-    for (int64_t i3 = 0; i3 < ne3; i3++) { // batch
														
 
															-        for (int64_t i2 = 0; i2 < ne2; i2++) { // seq-len
														
 
															-
														
 
															-            float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
														
 
															-            if (!is_mrope) {
														
 
															-                const int64_t p = pos[i2];
														
 
															-                ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
														
 
															-            }
														
 
															-            else {
														
 
															-                const int64_t p_t = pos[i2];
														
 
															-                const int64_t p_h = pos[i2 + ne2];
														
 
															-                const int64_t p_w = pos[i2 + ne2 * 2];
														
 
															-                const int64_t p_e = pos[i2 + ne2 * 3];
														
 
															-                ggml_mrope_cache_init(
														
 
															-                    p_t, p_h, p_w, p_e, sections, is_imrope, is_vision,
														
 
															-                    freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
														
 
															-            }
														
 
															-
														
 
															-            for (int64_t i1 = 0; i1 < ne1; i1++) { // attn-heads
														
 
															-                if (ir++ < ir0) continue;
														
 
															-                if (ir   > ir1) break;
														
 
															-
														
 
															-                if (is_neox || is_mrope) {
														
 
															-                    if (is_vision){
														
 
															-                        for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
														
 
															-                            const int64_t ic = i0/2;
														
 
															-
														
 
															-                            const float cos_theta = cache[i0 + 0];
														
 
															-                            const float sin_theta = cache[i0 + 1];
														
 
															-
														
 
															-                            const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
														
 
															-                            float * dst_data  = (float *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + ic*nb0);
														
 
															-
														
 
															-                            const float x0 = src[0];
														
 
															-                            const float x1 = src[n_dims];
														
 
															-
														
 
															-                            dst_data[0]      = x0*cos_theta - x1*sin_theta;
														
 
															-                            dst_data[n_dims] = x0*sin_theta + x1*cos_theta;
														
 
															-                        }
														
 
															-                    } else {
														
 
															-                        for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
														
 
															-                            const int64_t ic = i0/2;
														
 
															-
														
 
															-                            const float cos_theta = cache[i0 + 0];
														
 
															-                            const float sin_theta = cache[i0 + 1];
														
 
															-
														
 
															-                            const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
														
 
															-                            float * dst_data  = (float *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + ic*nb0);
														
 
															-
														
 
															-                            const float x0 = src[0];
														
 
															-                            const float x1 = src[n_dims/2];
														
 
															-
														
 
															-                            dst_data[0]        = x0*cos_theta - x1*sin_theta;
														
 
															-                            dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
														
 
															-                        }
														
 
															-                    }
														
 
															-                } else {
														
 
															-                    for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
														
 
															-                        const float cos_theta = cache[i0 + 0];
														
 
															-                        const float sin_theta = cache[i0 + 1];
														
 
															-
														
 
															-                        const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
														
 
															-                              float * dst_data  = (float *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
														
 
															-
														
 
															-                        const float x0 = src[0];
														
 
															-                        const float x1 = src[1];
														
 
															-
														
 
															-                        dst_data[0] = x0*cos_theta - x1*sin_theta;
														
 
															-                        dst_data[1] = x0*sin_theta + x1*cos_theta;
														
 
															-                    }
														
 
															-                }
														
 
															-
														
 
															-                if (is_vision) {
														
 
															-                    for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
														
 
															-                        const int64_t ic = i0/2;
														
 
															-
														
 
															-                        const float cos_theta = cache[i0 + 0];
														
 
															-                        const float sin_theta = cache[i0 + 1];
														
 
															-
														
 
															-                        const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
														
 
															-                        float * dst_data  = (float *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + ic*nb0);
														
 
															-
														
 
															-                        const float x0 = src[0];
														
 
															-                        const float x1 = src[n_dims];
														
 
															-
														
 
															-                        dst_data[0]      = x0*cos_theta - x1*sin_theta;
														
 
															-                        dst_data[n_dims] = x0*sin_theta + x1*cos_theta;
														
 
															-                    }
														
 
															-                } else {
														
 
															-                    // fill the remain channels with data from src tensor
														
 
															-                    for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
														
 
															-                        const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
														
 
															-                        float * dst_data  = (float *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
														
 
															-
														
 
															-                        dst_data[0] = src[0];
														
 
															-                        dst_data[1] = src[1];
														
 
															-                    }
														
 
															-                }
														
 
															-            }
														
 
															-        }
														
 
															-    }
														
 
															+    dst[0]        = type_conversion_table<T>::from_f32(x0*cos_theta - x1*sin_theta);
														
 
															+    dst[n_offset] = type_conversion_table<T>::from_f32(x0*sin_theta + x1*cos_theta);
														
 
															+  }
														
 
															 }
														
 
															-// TODO: deduplicate f16/f32 code
														
 
															-static void ggml_compute_forward_rope_f16(
														
 
															+template<typename T> //float or ggml_fp16_t
														
 
															+static void ggml_compute_forward_rope_flt(
														
 
															         const ggml_compute_params * params,
														
 
															         ggml_tensor * dst,
														
 
															         const bool forward) {
														
@@ -5699,6 +5533,9 @@ static void ggml_compute_forward_rope_f16(
 
															     const ggml_tensor * src1 = dst->src[1];
														
 
															     const ggml_tensor * src2 = dst->src[2];
														
 
															+    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
														
 
															+    GGML_ASSERT(src1->type == GGML_TYPE_I32);
														
 
															+
														
 
															     float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
														
 
															     int sections[4];
														
@@ -5707,6 +5544,7 @@ static void ggml_compute_forward_rope_f16(
 
															     const int mode       = ((int32_t *) dst->op_params)[2];
														
 
															     //const int n_ctx      = ((int32_t *) dst->op_params)[3];
														
 
															     const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
														
 
															+
														
 
															     memcpy(&freq_base,   (int32_t *) dst->op_params +  5, sizeof(float));
														
 
															     memcpy(&freq_scale,  (int32_t *) dst->op_params +  6, sizeof(float));
														
 
															     memcpy(&ext_factor,  (int32_t *) dst->op_params +  7, sizeof(float));
														
@@ -5715,13 +5553,13 @@ static void ggml_compute_forward_rope_f16(
 
															     memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
														
 
															     memcpy(&sections,    (int32_t *) dst->op_params + 11, sizeof(int)*4);
														
 
															-
														
 
															     GGML_TENSOR_UNARY_OP_LOCALS
														
 
															     //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
														
 
															     //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
														
 
															-    GGML_ASSERT(nb0 == sizeof(ggml_fp16_t));
														
 
															+    GGML_ASSERT(nb0 == nb00);
														
 
															+    GGML_ASSERT(nb0 == sizeof(T));
														
 
															     const int ith = params->ith;
														
 
															     const int nth = params->nth;
														
@@ -5746,12 +5584,11 @@ static void ggml_compute_forward_rope_f16(
 
															     float corr_dims[2];
														
 
															     ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
														
 
															-    const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
														
 
															-    const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
														
 
															-    const bool is_imrope = mode == GGML_ROPE_TYPE_IMROPE;
														
 
															+    const bool is_imrope = mode == GGML_ROPE_TYPE_IMROPE; // qwen3vl apply interleaved mrope
														
 
															+    const bool mrope_used = mode & GGML_ROPE_TYPE_MROPE;  // ggml_rope_multi, note: also true for vision ((24 & 8) != 0) and for imrope
														
 
															     const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
														
 
															-    if (is_mrope) {
														
 
															+    if (mrope_used) {
														
 
															         GGML_ASSERT(sections[0] > 0 || sections[1] > 0 || sections[2] > 0);
														
 
															     }
														
@@ -5773,11 +5610,11 @@ static void ggml_compute_forward_rope_f16(
 
															     const int32_t * pos = (const int32_t *) src1->data;
														
 
															-    for (int64_t i3 = 0; i3 < ne3; i3++) {
														
 
															-        for (int64_t i2 = 0; i2 < ne2; i2++) {
														
 
															+    for (int64_t i3 = 0; i3 < ne3; i3++) { // batch
														
 
															+        for (int64_t i2 = 0; i2 < ne2; i2++) { // seq-len
														
 
															             float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
														
 
															-            if (!is_mrope) {
														
 
															+            if (!mrope_used) {
														
 
															                 const int64_t p = pos[i2];
														
 
															                 ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
														
 
															             }
														
@@ -5791,86 +5628,40 @@ static void ggml_compute_forward_rope_f16(
 
															                     freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
														
 
															             }
														
 
															-            for (int64_t i1 = 0; i1 < ne1; i1++) {
														
 
															+            for (int64_t i1 = 0; i1 < ne1; i1++) { // attn-heads
														
 
															                 if (ir++ < ir0) continue;
														
 
															                 if (ir   > ir1) break;
														
 
															-                if (is_neox || is_mrope) {
														
 
															-                    if (is_vision) {
														
 
															-                        for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
														
 
															-                            const int64_t ic = i0/2;
														
 
															-
														
 
															-                            const float cos_theta = cache[i0 + 0];
														
 
															-                            const float sin_theta = cache[i0 + 1];
														
 
															-
														
 
															-                            const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
														
 
															-                            ggml_fp16_t * dst_data  = (ggml_fp16_t *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + ic*nb0);
														
 
															-
														
 
															-                            const float x0 = GGML_CPU_FP16_TO_FP32(src[0]);
														
 
															-                            const float x1 = GGML_CPU_FP16_TO_FP32(src[n_dims]);
														
 
															-
														
 
															-                            dst_data[0]      = GGML_CPU_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
														
 
															-                            dst_data[n_dims] = GGML_CPU_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
														
 
															-                        }
														
 
															-                    } else {
														
 
															-                        for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
														
 
															-                            const int64_t ic = i0/2;
														
 
															-
														
 
															-                            const float cos_theta = cache[i0 + 0];
														
 
															-                            const float sin_theta = cache[i0 + 1];
														
 
															-
														
 
															-                            const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
														
 
															-                            ggml_fp16_t * dst_data  = (ggml_fp16_t *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + ic*nb0);
														
 
															-
														
 
															-                            const float x0 = GGML_CPU_FP16_TO_FP32(src[0]);
														
 
															-                            const float x1 = GGML_CPU_FP16_TO_FP32(src[n_dims/2]);
														
 
															-
														
 
															-                            dst_data[0]        = GGML_CPU_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
														
 
															-                            dst_data[n_dims/2] = GGML_CPU_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
														
 
															-                        }
														
 
															-                    }
														
 
															-                } else {
														
 
															-                    for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
														
 
															-                        const float cos_theta = cache[i0 + 0];
														
 
															-                        const float sin_theta = cache[i0 + 1];
														
 
															-
														
 
															-                        const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
														
 
															-                              ggml_fp16_t * dst_data  = (ggml_fp16_t *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
														
 
															-
														
 
															-                        const float x0 = GGML_CPU_FP16_TO_FP32(src[0]);
														
 
															-                        const float x1 = GGML_CPU_FP16_TO_FP32(src[1]);
														
 
															-
														
 
															-                        dst_data[0] = GGML_CPU_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
														
 
															-                        dst_data[1] = GGML_CPU_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
														
 
															-                    }
														
 
															+                T * src = (T *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
														
 
															+                T * dst_data  = (T *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1);
														
 
															+
														
 
															+                switch (mode) {
														
 
															+                    case GGML_ROPE_TYPE_NORMAL:
														
 
															+                        rotate_pairs<T>(n_dims, 1, cache, src, dst_data, 1);
														
 
															+                        break;
														
 
															+                    case GGML_ROPE_TYPE_NEOX:
														
 
															+                    case GGML_ROPE_TYPE_MROPE:
														
 
															+                    case GGML_ROPE_TYPE_IMROPE:
														
 
															+                        rotate_pairs<T>(n_dims, n_dims/2, cache, src, dst_data);
														
 
															+                        break;
														
 
															+                    case GGML_ROPE_TYPE_VISION:
														
 
															+                        rotate_pairs<T>(ne0, n_dims, cache, src, dst_data);
														
 
															+                        break;
														
 
															+                    default:
														
 
															+                        GGML_ABORT("rope type not supported");
														
 
															                 }
														
 
															-                if (is_vision) {
														
 
															-                    for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
														
 
															-                        const int64_t ic = i0/2;
														
 
															-
														
 
															-                        const float cos_theta = cache[i0 + 0];
														
 
															-                        const float sin_theta = cache[i0 + 1];
														
 
															-
														
 
															-                        const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
														
 
															-                        ggml_fp16_t * dst_data  = (ggml_fp16_t *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + ic*nb0);
														
 
															-
														
 
															-                        const float x0 = GGML_CPU_FP16_TO_FP32(src[0]);
														
 
															-                        const float x1 = GGML_CPU_FP16_TO_FP32(src[n_dims]);
														
 
															-
														
 
															-                        dst_data[0]      = GGML_CPU_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
														
 
															-                        dst_data[n_dims] = GGML_CPU_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
														
 
															-                    }
														
 
															-                } else {
														
 
															+                if (!is_vision) {
														
 
															+                    // fill the remaining channels with data from src tensor
														
 
															                     for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
														
 
															-                        const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
														
 
															-                        ggml_fp16_t * dst_data  = (ggml_fp16_t *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
														
 
															+                        const T * const src = (T *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
														
 
															+                        T * dst_data  = (T *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
														
 
															                         dst_data[0] = src[0];
														
 
															                         dst_data[1] = src[1];
														
 
															                     }
														
 
															                 }
														
 
															-            }
														
 
															+            } //attn-heads
														
 
															         }
														
 
															     }
														
 
															 }
														
@@ -5884,11 +5675,11 @@ void ggml_compute_forward_rope(
 
															     switch (src0->type) {
														
 
															         case GGML_TYPE_F16:
														
 
															             {
														
 
															-                ggml_compute_forward_rope_f16(params, dst, true);
														
 
															+                ggml_compute_forward_rope_flt<ggml_fp16_t>(params, dst, true);
														
 
															             } break;
														
 
															         case GGML_TYPE_F32:
														
 
															             {
														
 
															-                ggml_compute_forward_rope_f32(params, dst, true);
														
 
															+                ggml_compute_forward_rope_flt<float>(params, dst, true);
														
 
															             } break;
														
 
															         default:
														
 
															             {
														
@@ -5908,11 +5699,11 @@ void ggml_compute_forward_rope_back(
 
															     switch (src0->type) {
														
 
															         case GGML_TYPE_F16:
														
 
															             {
														
 
															-                ggml_compute_forward_rope_f16(params, dst, false);
														
 
															+                ggml_compute_forward_rope_flt<ggml_fp16_t>(params, dst, false);
														
 
															             } break;
														
 
															         case GGML_TYPE_F32:
														
 
															             {
														
 
															-                ggml_compute_forward_rope_f32(params, dst, false);
														
 
															+                ggml_compute_forward_rope_flt<float>(params, dst, false);
														
 
															             } break;
														
 
															         default:
														
 
															             {
														
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -7603,6 +7603,22 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
 
															         test_cases.emplace_back(new test_add_id(GGML_TYPE_F32, GGML_TYPE_F32, 2880, 32, 4, n_token));
														
 
															     }
														
 
															+    for (bool fw : {true, false}) { // fw == forward
														
 
															+        for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
														
 
															+            for (bool ff : {false, true}) { // freq_factors
														
 
															+                for (float v : { 0, 1 }) {
														
 
															+                    test_cases.emplace_back(new test_rope(type, {128,  32, 512, 1}, 128, GGML_ROPE_TYPE_NORMAL, 512, 1.0f, 0.0f, 1.0f, ff, v, fw)); // llama 7B
														
 
															+                    test_cases.emplace_back(new test_rope(type, {128,  64, 512, 1}, 128, GGML_ROPE_TYPE_NORMAL, 512, 1.0f, 0.0f, 1.0f, ff, v, fw)); // llama 65B
														
 
															+                    test_cases.emplace_back(new test_rope(type, { 80,  32, 512, 1},  20, GGML_ROPE_TYPE_NEOX, 512, 1.0f, 0.0f, 1.0f, ff, v, fw)); // neox (stablelm)
														
 
															+                    test_cases.emplace_back(new test_rope(type, { 64,   8, 512, 1},  64, GGML_ROPE_TYPE_NEOX, 512, 1.0f, 0.0f, 1.0f, ff, v, fw)); // neox (falcon 40B)
														
 
															+                    test_cases.emplace_back(new test_rope(type, {128,  12, 512, 1}, 128, GGML_ROPE_TYPE_MROPE,  512, 1.0f, 0.0f, 1.0f, ff, v, fw)); // rope_multi,m-rope (qwen2vl 2B)
														
 
															+                    test_cases.emplace_back(new test_rope(type, {128,  12, 2, 1}, 128, GGML_ROPE_TYPE_IMROPE,  512, 1.0f, 0.0f, 1.0f, ff, v, fw)); // rope_multi,imrope (qwen3vl 2B)
														
 
															+                    test_cases.emplace_back(new test_rope(type, { 80,  16, 2, 1},  80, GGML_ROPE_TYPE_VISION, 512, 1.0f, 0.0f, 1.0f, ff, v, fw)); // rope_multi,m-rope (qwen2vl ViT)
														
 
															+                }
														
 
															+            }
														
 
															+        }
														
 
															+    }
														
 
															+
														
 
															     std::vector<std::array<int64_t, 4>> reduce_rows_cases = {
														
 
															         { 8192, 1,    1, 1 },
														
 
															         { 8192, 8192, 1, 1 },
														
--- a/tests/test-rope.cpp
+++ b/tests/test-rope.cpp
@@ -138,7 +138,7 @@ int main(int /*argc*/, const char ** /*argv*/) {
 
															     struct ggml_tensor * x;
														
 
															     // rope f32
														
 
															-    for (int m = 0; m < 6; ++m) {
														
 
															+    for (int m = 0; m < 5; ++m) {
														
 
															         const int ndims = 4;
														
 
															         const int64_t n_rot = 128;
														
@@ -153,7 +153,7 @@ int main(int /*argc*/, const char ** /*argv*/) {
 
															         x = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
														
 
															         int mode = -1;
														
 
															-        if (m < 3) {
														
 
															+        if (m < 2) {
														
 
															             struct ggml_tensor * p0 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
														
 
															             struct ggml_tensor * p1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
														
 
															             struct ggml_tensor * p2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
														
@@ -163,8 +163,8 @@ int main(int /*argc*/, const char ** /*argv*/) {
 
															                 ((int32_t *) p1->data)[i] = n_past_2 - n_past_0;
														
 
															                 ((int32_t *) p2->data)[i] = n_past_2 + i;
														
 
															             }
														
 
															-            // test mode 0, 2, 4 (standard, GPT-NeoX, GLM)
														
 
															-            mode = m == 0 ? 0 : m == 1 ? 2 : 4;
														
 
															+            // test mode 0, 2  (standard, GPT-NeoX)
														
 
															+            mode = m == 0 ? GGML_ROPE_TYPE_NORMAL : GGML_ROPE_TYPE_NEOX;
														
 
															             // 100, 101, 102, ..., 172
														
 
															             r0 = ggml_rope(ctx0, x,  p0, n_rot, mode);
														
@@ -180,7 +180,8 @@ int main(int /*argc*/, const char ** /*argv*/) {
 
															             struct ggml_tensor * p2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2] * 4);
														
 
															             int sections[4] = {16, 24, 24, 0};
														
 
															-            mode = (m == 3) ? GGML_ROPE_TYPE_MROPE : (m == 4) ? GGML_ROPE_TYPE_VISION : GGML_ROPE_TYPE_IMROPE;
														
 
															+
														
 
															+            mode = (m == 2) ? GGML_ROPE_TYPE_MROPE : (m == 3) ? GGML_ROPE_TYPE_VISION : GGML_ROPE_TYPE_IMROPE;
														
 
															             for (int i = 0; i < ne[2]; ++i) {
														
 
															                 for (int j = 0; j < 4; ++j) {