//go:build cuda

// Package compute provides CUDA-accelerated neural network operations.
package compute

import (
	"fmt"
	"math"
	"unsafe"

	"makarna/pkg/backend/cpu"
	"makarna/pkg/backend/cuda"
	"makarna/pkg/backend/device"
	"makarna/pkg/profile"
	"makarna/pkg/tensor"
)

// RMSNorm applies RMS normalization in-place.
// It uses CUDA when the context is placed on a GPU layer.
func RMSNorm(ctx *Context, x, w tensor.Tensor, eps float32) error {
	useGPU := ctx != nil && ctx.IsGPU() && device.CUDAAvailable()
	if !useGPU {
		profile.Start("RMSNorm/CPU")
		err := rmsNormCPU(x, w, eps)
		profile.End("RMSNorm/CPU")
		return err
	}

	profile.Start("RMSNorm/GPU")
	defer profile.End("RMSNorm/GPU")

	// The GPU path still expects host-resident tensors; data is staged
	// onto the device for the kernel and copied back afterwards.
	xCPU, ok := x.(*cpu.Tensor)
	if !ok {
		return fmt.Errorf("rmsnorm: x must be CPU tensor")
	}
	wCPU, ok := w.(*cpu.Tensor)
	if !ok {
		return fmt.Errorf("rmsnorm: w must be CPU tensor")
	}

	gpu := ctx.Placement().GPU
	shape := x.Shape()
	seqLen := shape[0]
	dim := shape[1]

	// Upload x to the GPU.
	profile.Start("RMSNorm/alloc_x")
	gpuX, err := cuda.NewTensor(shape, tensor.Float32, gpu)
	profile.End("RMSNorm/alloc_x")
	if err != nil {
		return err
	}
	if err := gpuX.CopyFrom(xCPU.DataFloat32()); err != nil {
		return err
	}

	// Fetch the weight from the per-device cache, uploading on first use.
	cache := GetWeightCache(gpu)
	wKey := fmt.Sprintf("rmsnorm_%p", wCPU)
	gpuW, ok := cache.Get(wKey)
	if !ok {
		gpuW, err = cache.Upload(wKey, wCPU)
		if err != nil {
			return err
		}
	}

	// Run the CUDA RMSNorm kernel.
	profile.Start("RMSNorm/kernel")
	err = cuda.RMSNorm(gpuX.Data().(unsafe.Pointer), gpuW, seqLen, dim, eps, gpu)
	profile.End("RMSNorm/kernel")
	if err != nil {
		return err
	}

	// Copy the normalized result back to the host.
	if err := gpuX.CopyToHost(xCPU.DataFloat32()); err != nil {
		return err
	}
	return nil
}

func rmsNormCPU(x, w tensor.Tensor, eps float32) error {
	xCPU := x.(*cpu.Tensor)
	wCPU := w.(*cpu.Tensor)
	xData := xCPU.DataFloat32()
	wData := wCPU.DataFloat32()
	dim := wCPU.Shape().NumElements()
	numRows := xCPU.Shape().NumElements() / dim

	for i := 0; i < numRows; i++ {
		row := xData[i*dim : (i+1)*dim]
		// RMS(row) = sqrt(mean(row^2) + eps); scale each element by
		// 1/RMS times the learned per-channel weight.
		ss := cpu.DotFloat32(row, row) / float32(dim)
		invRMS := 1.0 / float32(math.Sqrt(float64(ss+eps)))
		for j := 0; j < dim; j++ {
			row[j] = row[j] * invRMS * wData[j]
		}
	}
	return nil
}
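// Usage sketch for RMSNorm. This is a minimal sketch, not taken from
// this file: cpu.NewTensor and ctx stand in for whatever constructors
// the cpu and compute packages actually expose.
//
//	x, _ := cpu.NewTensor(tensor.Shape{seqLen, dim}, tensor.Float32) // activations, one row per token
//	w, _ := cpu.NewTensor(tensor.Shape{dim}, tensor.Float32)         // learned per-channel scale
//	if err := RMSNorm(ctx, x, w, 1e-5); err != nil {
//		return err
//	}
//	// x is normalized in-place: x[i][j] = x[i][j] / RMS(x[i]) * w[j],
//	// on the GPU when ctx targets a CUDA device, otherwise on the CPU.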
// RoPE applies Rotary Positional Embeddings in-place.
// positions must contain one absolute position per row of x.
func RoPE(ctx *Context, x tensor.Tensor, positions []int, headDim int, theta float32) error {
	if len(positions) != x.Shape()[0] {
		return fmt.Errorf("rope: got %d positions for %d rows", len(positions), x.Shape()[0])
	}

	useGPU := ctx != nil && ctx.IsGPU() && device.CUDAAvailable()
	if !useGPU {
		profile.Start("RoPE/CPU")
		err := ropeCPU(x, positions, headDim, theta)
		profile.End("RoPE/CPU")
		return err
	}

	profile.Start("RoPE/GPU")
	defer profile.End("RoPE/GPU")

	xCPU := x.(*cpu.Tensor)
	gpu := ctx.Placement().GPU
	shape := x.Shape()
	seqLen := shape[0]
	totalDim := shape[1]
	numHeads := totalDim / headDim

	// Upload x to the GPU.
	gpuX, err := cuda.NewTensor(shape, tensor.Float32, gpu)
	if err != nil {
		return err
	}
	if err := gpuX.CopyFrom(xCPU.DataFloat32()); err != nil {
		return err
	}

	// Upload positions. int32 and float32 are both 4 bytes, so a
	// Float32 allocation of the same length holds the int32 data; the
	// slice is reinterpreted below and the kernel reads it as int32.
	posData := make([]int32, len(positions))
	for i, p := range positions {
		posData[i] = int32(p)
	}
	gpuPos, err := cuda.NewTensor(tensor.Shape{len(positions)}, tensor.Float32, gpu)
	if err != nil {
		return err
	}
	posPtr := unsafe.Pointer(&posData[0])
	if err := gpuPos.CopyFrom(unsafe.Slice((*float32)(posPtr), len(posData))); err != nil {
		return err
	}

	// Run the CUDA RoPE kernel.
	profile.Start("RoPE/kernel")
	err = cuda.RoPE(gpuX.Data().(unsafe.Pointer), gpuPos.Data().(unsafe.Pointer), seqLen, numHeads, headDim, theta, gpu)
	profile.End("RoPE/kernel")
	if err != nil {
		return err
	}

	// Copy the rotated result back to the host.
	if err := gpuX.CopyToHost(xCPU.DataFloat32()); err != nil {
		return err
	}
	return nil
}

func ropeCPU(x tensor.Tensor, positions []int, headDim int, theta float32) error {
	xCPU := x.(*cpu.Tensor)
	data := xCPU.DataFloat32()
	shape := x.Shape()
	seqLen := shape[0]
	totalDim := shape[1]
	halfDim := headDim / 2

	// Precompute inverse frequencies: invFreqs[j] = theta^(-2j/headDim).
	invFreqs := make([]float64, halfDim)
	for j := 0; j < halfDim; j++ {
		invFreqs[j] = 1.0 / math.Pow(float64(theta), float64(2*j)/float64(headDim))
	}

	for seq := 0; seq < seqLen; seq++ {
		pos := positions[seq]
		rowStart := seq * totalDim
		for headStart := 0; headStart < totalDim; headStart += headDim {
			// Rotate the pair (j, j+halfDim) within each head by the
			// position-dependent angle pos * invFreqs[j].
			for j := 0; j < halfDim; j++ {
				freq := float64(pos) * invFreqs[j]
				sin, cos := math.Sincos(freq)
				idx0 := rowStart + headStart + j
				idx1 := rowStart + headStart + j + halfDim
				v0 := data[idx0]
				v1 := data[idx1]
				data[idx0] = v0*float32(cos) - v1*float32(sin)
				data[idx1] = v1*float32(cos) + v0*float32(sin)
			}
		}
	}
	return nil
}
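// Usage sketch for RoPE, under the same caveat as the RMSNorm sketch
// above (cpu.NewTensor and ctx are assumed placeholders). Positions
// are absolute token indices, so continuing generation from a cached
// prefix just offsets them:
//
//	q, _ := cpu.NewTensor(tensor.Shape{seqLen, numHeads * headDim}, tensor.Float32)
//	positions := make([]int, seqLen)
//	for i := range positions {
//		positions[i] = cacheLen + i // absolute position of each new token
//	}
//	if err := RoPE(ctx, q, positions, headDim, 10000.0); err != nil {
//		return err
//	}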
// SwiGLU computes SiLU(gate) * up element-wise and stores the result in out.
func SwiGLU(ctx *Context, gate, up, out tensor.Tensor) error {
	useGPU := ctx != nil && ctx.IsGPU() && device.CUDAAvailable()
	if !useGPU {
		profile.Start("SwiGLU/CPU")
		err := swigluCPU(gate, up, out)
		profile.End("SwiGLU/CPU")
		return err
	}

	profile.Start("SwiGLU/GPU")
	defer profile.End("SwiGLU/GPU")

	gCPU := gate.(*cpu.Tensor)
	uCPU := up.(*cpu.Tensor)
	oCPU := out.(*cpu.Tensor)
	gpu := ctx.Placement().GPU
	n := gate.Shape().NumElements()

	// Upload gate and up to the GPU.
	gpuGate, err := cuda.NewTensor(gate.Shape(), tensor.Float32, gpu)
	if err != nil {
		return err
	}
	gpuUp, err := cuda.NewTensor(up.Shape(), tensor.Float32, gpu)
	if err != nil {
		return err
	}
	if err := gpuGate.CopyFrom(gCPU.DataFloat32()); err != nil {
		return err
	}
	if err := gpuUp.CopyFrom(uCPU.DataFloat32()); err != nil {
		return err
	}

	// SiLU(gate) in-place.
	profile.Start("SwiGLU/silu_kernel")
	if err := cuda.SiLU(gpuGate.Data().(unsafe.Pointer), n, gpu); err != nil {
		profile.End("SwiGLU/silu_kernel")
		return err
	}
	profile.End("SwiGLU/silu_kernel")

	// gate = gate * up
	profile.Start("SwiGLU/mul_kernel")
	if err := cuda.MulInplace(gpuGate.Data().(unsafe.Pointer), gpuUp.Data().(unsafe.Pointer), n, gpu); err != nil {
		profile.End("SwiGLU/mul_kernel")
		return err
	}
	profile.End("SwiGLU/mul_kernel")

	// Copy the result back to the output tensor on the host.
	if err := gpuGate.CopyToHost(oCPU.DataFloat32()); err != nil {
		return err
	}
	return nil
}

func swigluCPU(gate, up, out tensor.Tensor) error {
	gCPU := gate.(*cpu.Tensor)
	uCPU := up.(*cpu.Tensor)
	oCPU := out.(*cpu.Tensor)
	gData := gCPU.DataFloat32()
	uData := uCPU.DataFloat32()
	oData := oCPU.DataFloat32()

	for i := range gData {
		gv := gData[i]
		// SiLU(x) = x * sigmoid(x)
		silu := gv / (1.0 + float32(math.Exp(float64(-gv))))
		oData[i] = silu * uData[i]
	}
	return nil
}

// Softmax applies softmax along the last dimension.
// It currently runs on the CPU regardless of ctx placement.
func Softmax(ctx *Context, x tensor.Tensor) error {
	xCPU := x.(*cpu.Tensor)
	return softmaxCPU(xCPU)
}

func softmaxCPU(x *cpu.Tensor) error {
	data := x.DataFloat32()
	shape := x.Shape()
	if len(shape) != 2 {
		return fmt.Errorf("softmax: expected 2D tensor")
	}
	rows := shape[0]
	cols := shape[1]

	for i := 0; i < rows; i++ {
		row := data[i*cols : (i+1)*cols]
		// Subtract the row max before exponentiating for numerical stability.
		maxVal := row[0]
		for _, v := range row[1:] {
			if v > maxVal {
				maxVal = v
			}
		}
		sum := float32(0)
		for j := range row {
			row[j] = float32(math.Exp(float64(row[j] - maxVal)))
			sum += row[j]
		}
		for j := range row {
			row[j] /= sum
		}
	}
	return nil
}

// Add performs element-wise addition: dst += src.
// Both tensors must hold the same number of elements.
func Add(dst, src tensor.Tensor) error {
	dCPU := dst.(*cpu.Tensor)
	sCPU := src.(*cpu.Tensor)
	dData := dCPU.DataFloat32()
	sData := sCPU.DataFloat32()
	if len(dData) != len(sData) {
		return fmt.Errorf("add: size mismatch %d vs %d", len(dData), len(sData))
	}
	for i := range dData {
		dData[i] += sData[i]
	}
	return nil
}

// CopyData copies tensor data: dst = src.
func CopyData(dst, src tensor.Tensor) error {
	dCPU := dst.(*cpu.Tensor)
	sCPU := src.(*cpu.Tensor)
	copy(dCPU.DataFloat32(), sCPU.DataFloat32())
	return nil
}
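// Putting the primitives together: a sketch of a standard transformer
// SwiGLU feed-forward block built from the ops in this file. MatMul is
// not defined here, so it appears as a hypothetical helper; the weight
// and scratch tensors are likewise assumptions for illustration only:
//
//	// out = (SiLU(x @ wGate) * (x @ wUp)) @ wDown, plus residual
//	_ = MatMul(ctx, x, wGate, gate) // hypothetical projection helper
//	_ = MatMul(ctx, x, wUp, up)     // hypothetical projection helper
//	_ = SwiGLU(ctx, gate, up, hidden)
//	_ = MatMul(ctx, hidden, wDown, out) // hypothetical
//	_ = Add(out, residual) // residual connection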