package nn

import (
	"fmt"
	"math"

	"makarna/pkg/backend/cpu"
)

// RoPE applies Rotary Positional Embeddings to x in place.
//
//   - x: tensor whose first two dimensions are [seqLen, numHeads * headDim];
//     its float32 data is overwritten with the rotated values.
//   - positions: position of each token in the sequence; needs at least
//     seqLen entries, and entry i is used for row i.
//   - headDim: dimension of each attention head; must be positive, even, and
//     divide numHeads * headDim exactly.
//   - theta: RoPE base frequency (typically 10000 for Llama, 1000000 for
//     Qwen3); must be positive.
//
// This uses the split-half rotation format (HuggingFace standard):
//   - Split head into [first_half, second_half]
//   - new_first = first * cos - second * sin
//   - new_second = second * cos + first * sin
//
// The angle for pair j is position / theta^(2j/headDim).
//
// All arguments are validated before any element is written, so x is left
// untouched whenever a non-nil error is returned.
func RoPE(x *cpu.Tensor, positions []int, headDim int, theta float32) error {
	if x == nil {
		return fmt.Errorf("rope: tensor is nil")
	}
	shape := x.Shape()
	if len(shape) < 2 {
		return fmt.Errorf("rope: tensor must have at least 2 dimensions, got %d", len(shape))
	}
	seqLen, totalDim := shape[0], shape[1] // totalDim = numHeads * headDim
	if seqLen < 0 || totalDim < 0 {
		return fmt.Errorf("rope: invalid tensor shape [%d, %d]", seqLen, totalDim)
	}
	// A zero headDim would never advance the per-head loop, and an odd one
	// leaves an element without a rotation partner.
	if headDim <= 0 || headDim%2 != 0 {
		return fmt.Errorf("rope: headDim must be positive and even, got %d", headDim)
	}
	// A trailing partial head would make the second-half index run past the
	// end of the row.
	if totalDim%headDim != 0 {
		return fmt.Errorf("rope: row width %d is not a multiple of headDim %d", totalDim, headDim)
	}
	if len(positions) < seqLen {
		return fmt.Errorf("rope: got %d positions for %d rows", len(positions), seqLen)
	}
	// A non-positive or NaN base turns the inverse frequencies into NaN/Inf
	// and would silently poison the tensor.
	if math.IsNaN(float64(theta)) || theta <= 0 {
		return fmt.Errorf("rope: theta must be positive, got %v", theta)
	}
	data := x.DataFloat32()
	if len(data) < seqLen*totalDim {
		return fmt.Errorf("rope: tensor data has %d elements, shape needs %d", len(data), seqLen*totalDim)
	}

	halfDim := headDim / 2

	// Inverse frequencies depend only on the pair index, so compute them once
	// per call.
	invFreqs := make([]float64, halfDim)
	for j := range invFreqs {
		invFreqs[j] = 1.0 / math.Pow(float64(theta), float64(2*j)/float64(headDim))
	}

	// sin/cos depend on (position, pair index) but not on the head, so they
	// are computed once per row and reused for every head in that row.
	cosTab := make([]float32, halfDim)
	sinTab := make([]float32, halfDim)

	for seq := 0; seq < seqLen; seq++ {
		pos := float64(positions[seq])
		for j, inv := range invFreqs {
			s, c := math.Sincos(pos * inv)
			sinTab[j], cosTab[j] = float32(s), float32(c)
		}

		row := data[seq*totalDim : (seq+1)*totalDim]
		for headStart := 0; headStart < totalDim; headStart += headDim {
			// Split-half indexing: element j of the first half is paired
			// with element j of the second half.
			first := row[headStart : headStart+halfDim]
			second := row[headStart+halfDim : headStart+headDim]
			for j := range first {
				// Read both values before writing; each output needs both.
				v0, v1 := first[j], second[j]
				first[j] = v0*cosTab[j] - v1*sinTab[j]
				second[j] = v1*cosTab[j] + v0*sinTab[j]
			}
		}
	}
	return nil
}

// RoPESingle applies RoPE to x in place using consecutive positions that
// start at pos: row i of x (out of x.Shape()[0] rows) is rotated as position
// pos+i. With a one-row tensor, as in single-token generation, only pos
// itself is used. headDim and theta are forwarded to RoPE unchanged, and
// RoPE's error is returned as-is.
func RoPESingle(x *cpu.Tensor, pos, headDim int, theta float32) error {
	rows := x.Shape()[0]
	positions := make([]int, rows)
	for i, p := 0, pos; i < rows; i, p = i+1, p+1 {
		positions[i] = p
	}
	return RoPE(x, positions, headDim, theta)
}