package compute

import (
	"fmt"
	"unsafe"

	"makarna/pkg/backend/cpu"
	"makarna/pkg/backend/cpu/nn"
	"makarna/pkg/backend/cuda"
	"makarna/pkg/backend/device"
	"makarna/pkg/tensor"
)

// MoEConfig describes the routing and expert dimensions of a mixture-of-experts layer.
type MoEConfig struct {
	NumExperts           int
	TopK                 int
	IntermediateSize     int
	RouterActivationFunc string // "softmax" or "sigmoid"
	UseGroupedTopK       bool
	NumExpertGroup       int
	TopKGroup            int
	Renormalize          bool
	RoutedScalingFactor  float32
	NumSharedExperts     int
}

// MoEWeights holds the router weights and per-expert projections, plus optional shared-expert weights.
type MoEWeights struct {
	GateW    tensor.Tensor
	GateBias tensor.Tensor
	W1       []tensor.Tensor // gate projections per expert
	W2       []tensor.Tensor // down projections per expert
	W3       []tensor.Tensor // up projections per expert

	SharedGate, SharedUp, SharedDown tensor.Tensor
}

// HybridMoE runs the MoE layer on the best available backend: GPU when the
// context, the hidden activation, and the CUDA runtime allow it, otherwise CPU.
func HybridMoE(
	ctx *Context,
	hidden *Activation,
	weights *MoEWeights,
	cfg MoEConfig,
	out *Activation,
) error {
	// If CPUMoE is enabled, keep expert weights on CPU (saves GPU memory for large MoE models).
	if ctx != nil && ctx.CPUMoE {
		return hybridMoECPU(ctx, hidden, weights, cfg, out)
	}

	if ctx != nil && ctx.IsGPU() && hidden.IsGPU() && device.CUDAAvailable() && cuda.Available() {
		err := hybridMoEGPU(ctx, hidden, weights, cfg, out)
		if err == nil {
			return nil
		}
		// GPU failed (likely OOM); fall back to CPU.
	}
	return hybridMoECPU(ctx, hidden, weights, cfg, out)
}
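
// Illustrative call sketch for HybridMoE. Only the signatures defined in this
// file are taken as given; the config values below and the way ctx, hidden,
// and weights are constructed are placeholders for whatever the caller
// already has, not a prescribed setup.
//
//	cfg := MoEConfig{
//		NumExperts:           64,
//		TopK:                 6,
//		IntermediateSize:     1408,
//		RouterActivationFunc: "sigmoid",
//		Renormalize:          true,
//		RoutedScalingFactor:  1.0,
//	}
//	out, err := NewActivation(hidden.Shape(), tensor.DevicePlacement{Type: tensor.CUDA, GPU: 0})
//	if err != nil {
//		return err
//	}
//	defer FreeActivation(out)
//	if err := HybridMoE(ctx, hidden, weights, cfg, out); err != nil {
//		return err
//	}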

// hybridMoEGPU runs the router and expert projections on the GPU. Top-k
// routing and the weighted accumulation are done on the CPU for now, with
// per-token transfers between host and device.
func hybridMoEGPU(
	ctx *Context,
	hidden *Activation,
	weights *MoEWeights,
	cfg MoEConfig,
	out *Activation,
) error {
	gpu := ctx.Placement().GPU
	seqLen := hidden.Shape()[0]
	hiddenSize := hidden.Shape()[1]

	// 1. Router: hidden -> gate scores.
	gateAct, _ := NewActivation(tensor.Shape{seqLen, cfg.NumExperts}, tensor.DevicePlacement{Type: tensor.CUDA, GPU: gpu})
	defer FreeActivation(gateAct)
	if err := HybridLinear(ctx, hidden, weights.GateW, gateAct); err != nil {
		return err
	}
	gateCUDA, _ := gateAct.AsCUDA(gpu)
	gatePtr := gateCUDA.Data().(unsafe.Pointer)

	// 2. Apply router activation (softmax or sigmoid).
	if cfg.RouterActivationFunc == "softmax" {
		if err := cuda.SoftmaxRows(gatePtr, seqLen, cfg.NumExperts, gpu); err != nil {
			return err
		}
	} else {
		if err := cuda.Sigmoid(gatePtr, seqLen*cfg.NumExperts, gpu); err != nil {
			return err
		}
	}

	// 3. Add bias and select top-k experts.
	// For now, download to CPU for the routing logic, then upload the results.
	gateCPU, err := gateAct.AsCPU()
	if err != nil {
		return err
	}
	gateData := gateCPU.DataFloat32()

	biasCPU, ok := weights.GateBias.(*cpu.Tensor)
	if !ok {
		return fmt.Errorf("moe gate bias not cpu tensor")
	}
	biasData := biasCPU.DataFloat32()

	// Initialize output on GPU.
	outCUDA, err := out.AsCUDA(gpu)
	if err != nil {
		return err
	}
	outCPU := make([]float32, seqLen*hiddenSize)

	hiddenCPU, err := hidden.AsCPU()
	if err != nil {
		return err
	}
	inData := hiddenCPU.DataFloat32()

	// Pre-allocate reusable buffers (critical: allocating per token/expert
	// inside the loop leaks GPU memory).
	tokAct, _ := NewActivation(tensor.Shape{1, hiddenSize}, tensor.DevicePlacement{Type: tensor.CUDA, GPU: gpu})
	defer FreeActivation(tokAct)
	a1, _ := NewActivation(tensor.Shape{1, cfg.IntermediateSize}, tensor.DevicePlacement{Type: tensor.CUDA, GPU: gpu})
	defer FreeActivation(a1)
	a3, _ := NewActivation(tensor.Shape{1, cfg.IntermediateSize}, tensor.DevicePlacement{Type: tensor.CUDA, GPU: gpu})
	defer FreeActivation(a3)
	exOut, _ := NewActivation(tensor.Shape{1, hiddenSize}, tensor.DevicePlacement{Type: tensor.CUDA, GPU: gpu})
	defer FreeActivation(exOut)
	sharedGateBuf, _ := NewActivation(tensor.Shape{1, cfg.IntermediateSize}, tensor.DevicePlacement{Type: tensor.CUDA, GPU: gpu})
	defer FreeActivation(sharedGateBuf)
	sharedUpBuf, _ := NewActivation(tensor.Shape{1, cfg.IntermediateSize}, tensor.DevicePlacement{Type: tensor.CUDA, GPU: gpu})
	defer FreeActivation(sharedUpBuf)
	sharedOutBuf, _ := NewActivation(tensor.Shape{1, hiddenSize}, tensor.DevicePlacement{Type: tensor.CUDA, GPU: gpu})
	defer FreeActivation(sharedOutBuf)

	// Temp CPU tensor for token data upload.
	tokCPU := cpu.NewTensor(tensor.Shape{1, hiddenSize}, nil)

	for t := 0; t < seqLen; t++ {
		row := gateData[t*cfg.NumExperts : (t+1)*cfg.NumExperts]
		scores := make([]float32, cfg.NumExperts)
		copy(scores, row)

		scoresForChoice := make([]float32, cfg.NumExperts)
		for i := 0; i < cfg.NumExperts; i++ {
			scoresForChoice[i] = scores[i] + biasData[i]
		}

		masked := scoresForChoice
		if cfg.UseGroupedTopK {
			masked = nn.GroupedTopKMask(scoresForChoice, cfg.NumExpertGroup, cfg.TopKGroup)
		}
		choices := nn.SelectTopKExperts(masked, cfg.TopK, scores)
		if cfg.Renormalize && len(choices) > 1 {
			nn.RenormalizeMoEWeights(choices)
		}
		nn.ScaleMoEWeights(choices, cfg.RoutedScalingFactor)

		// Copy token data to GPU (reuse tokAct buffer).
		copy(tokCPU.DataFloat32(), inData[t*hiddenSize:(t+1)*hiddenSize])
		if tokCUDA, err := tokAct.AsCUDA(gpu); err == nil {
			tokCUDA.CopyFrom(tokCPU.DataFloat32())
		}

		acc := outCPU[t*hiddenSize : (t+1)*hiddenSize]
		for _, ch := range choices {
			ex := ch.Idx
			if ex >= len(weights.W1) || ex >= len(weights.W2) || ex >= len(weights.W3) {
				continue
			}
			w1 := weights.W1[ex]
			w2 := weights.W2[ex]
			w3 := weights.W3[ex]
			if w1 == nil || w2 == nil || w3 == nil {
				continue
			}

			// Reuse pre-allocated buffers for the expert projections.
			HybridLinear(ctx, tokAct, w1, a1)
			HybridLinear(ctx, tokAct, w3, a3)

			// SwiGLU on GPU: a1 = silu(a1) * a3
			HybridSiLU(ctx, a1)
			HybridMul(ctx, a1, a3)

			HybridLinear(ctx, a1, w2, exOut)

			exCPU, _ := exOut.AsCPU()
			wd := ch.Weight
			for j := 0; j < hiddenSize; j++ {
				acc[j] += wd * exCPU.DataFloat32()[j]
			}
		}

		// Shared experts.
		if cfg.NumSharedExperts > 0 && weights.SharedGate != nil && weights.SharedUp != nil && weights.SharedDown != nil {
			HybridLinear(ctx, tokAct, weights.SharedGate, sharedGateBuf)
			HybridLinear(ctx, tokAct, weights.SharedUp, sharedUpBuf)

			// SwiGLU on GPU.
			HybridSiLU(ctx, sharedGateBuf)
			HybridMul(ctx, sharedGateBuf, sharedUpBuf)

			HybridLinear(ctx, sharedGateBuf, weights.SharedDown, sharedOutBuf)

			sCPU, _ := sharedOutBuf.AsCPU()
			for j := 0; j < hiddenSize; j++ {
				acc[j] += sCPU.DataFloat32()[j]
			}
		}
	}

	// Upload result to GPU.
	outCUDA.CopyFrom(outCPU)
	return nil
}

// hybridMoECPU runs the full MoE layer on the CPU: router activation, top-k
// selection, per-expert SwiGLU, and optional shared experts.
func hybridMoECPU(
	ctx *Context,
	hidden *Activation,
	weights *MoEWeights,
	cfg MoEConfig,
	out *Activation,
) error {
	seqLen := hidden.Shape()[0]
	hiddenSize := hidden.Shape()[1]

	gateAct, _ := NewActivation(tensor.Shape{seqLen, cfg.NumExperts}, tensor.DevicePlacement{Type: tensor.CPU, GPU: -1})
	if err := HybridLinear(ctx, hidden, weights.GateW, gateAct); err != nil {
		return err
	}
	gateCPU, _ := gateAct.AsCPU()

	biasCPU, ok := weights.GateBias.(*cpu.Tensor)
	if !ok {
		return fmt.Errorf("moe gate bias not cpu tensor")
	}

	outCPU, _ := out.AsCPU()
	outData := outCPU.DataFloat32()
	hiddenCPU, _ := hidden.AsCPU()
	inData := hiddenCPU.DataFloat32()

	// Zero the output buffer since we accumulate into it.
	for i := range outData {
		outData[i] = 0
	}

	for t := 0; t < seqLen; t++ {
		row := gateCPU.DataFloat32()[t*cfg.NumExperts : (t+1)*cfg.NumExperts]
		scores := make([]float32, cfg.NumExperts)
		for i := 0; i < cfg.NumExperts; i++ {
			if cfg.RouterActivationFunc == "softmax" {
				scores[i] = row[i]
			} else {
				scores[i] = nn.Sigmoid(row[i])
			}
		}
		if cfg.RouterActivationFunc == "softmax" {
			nn.SoftmaxInplaceSimple(scores)
		}

		scoresForChoice := make([]float32, cfg.NumExperts)
		for i := 0; i < cfg.NumExperts; i++ {
			scoresForChoice[i] = scores[i] + biasCPU.DataFloat32()[i]
		}

		masked := scoresForChoice
		if cfg.UseGroupedTopK {
			masked = nn.GroupedTopKMask(scoresForChoice, cfg.NumExpertGroup, cfg.TopKGroup)
		}
		choices := nn.SelectTopKExperts(masked, cfg.TopK, scores)
		if cfg.Renormalize && len(choices) > 1 {
			nn.RenormalizeMoEWeights(choices)
		}
		nn.ScaleMoEWeights(choices, cfg.RoutedScalingFactor)

		tok := cpu.NewTensor(tensor.Shape{1, hiddenSize}, nil)
		copy(tok.DataFloat32(), inData[t*hiddenSize:(t+1)*hiddenSize])
		tokAct := NewActivationFrom(tok)

		acc := outData[t*hiddenSize : (t+1)*hiddenSize]
		for _, ch := range choices {
			ex := ch.Idx
			if ex >= len(weights.W1) || ex >= len(weights.W2) || ex >= len(weights.W3) {
				continue
			}
			w1 := weights.W1[ex]
			w2 := weights.W2[ex]
			w3 := weights.W3[ex]
			if w1 == nil || w2 == nil || w3 == nil {
				continue
			}

			a1, _ := NewActivation(tensor.Shape{1, cfg.IntermediateSize}, tensor.DevicePlacement{Type: tensor.CPU, GPU: -1})
			a3, _ := NewActivation(tensor.Shape{1, cfg.IntermediateSize}, tensor.DevicePlacement{Type: tensor.CPU, GPU: -1})
			HybridLinear(ctx, tokAct, w1, a1)
			HybridLinear(ctx, tokAct, w3, a3)

			a1CPU, _ := a1.AsCPU()
			a3CPU, _ := a3.AsCPU()
			a1d := a1CPU.DataFloat32()
			a3d := a3CPU.DataFloat32()
			// SwiGLU on CPU: a1 = silu(a1) * a3
			for j := 0; j < cfg.IntermediateSize; j++ {
				a1d[j] = a1d[j] * nn.Sigmoid(a1d[j])
				a1d[j] = a1d[j] * a3d[j]
			}

			mix := NewActivationFrom(a1CPU)
			exOut, _ := NewActivation(tensor.Shape{1, hiddenSize}, tensor.DevicePlacement{Type: tensor.CPU, GPU: -1})
			HybridLinear(ctx, mix, w2, exOut)

			exCPU, _ := exOut.AsCPU()
			wd := ch.Weight
			for j := 0; j < hiddenSize; j++ {
				acc[j] += wd * exCPU.DataFloat32()[j]
			}
		}

		// Shared experts.
		if cfg.NumSharedExperts > 0 && weights.SharedGate != nil && weights.SharedUp != nil && weights.SharedDown != nil {
			sharedGate, _ := NewActivation(tensor.Shape{1, cfg.IntermediateSize}, tensor.DevicePlacement{Type: tensor.CPU, GPU: -1})
			sharedUp, _ := NewActivation(tensor.Shape{1, cfg.IntermediateSize}, tensor.DevicePlacement{Type: tensor.CPU, GPU: -1})
			HybridLinear(ctx, tokAct, weights.SharedGate, sharedGate)
			HybridLinear(ctx, tokAct, weights.SharedUp, sharedUp)

			gCPU, _ := sharedGate.AsCPU()
			uCPU, _ := sharedUp.AsCPU()
			gD := gCPU.DataFloat32()
			uD := uCPU.DataFloat32()
			for j := 0; j < len(gD) && j < len(uD); j++ {
				gD[j] = gD[j] * nn.Sigmoid(gD[j])
				gD[j] = gD[j] * uD[j]
			}

			mix := NewActivationFrom(gCPU)
			sharedOut, _ := NewActivation(tensor.Shape{1, hiddenSize}, tensor.DevicePlacement{Type: tensor.CPU, GPU: -1})
			HybridLinear(ctx, mix, weights.SharedDown, sharedOut)

			sCPU, _ := sharedOut.AsCPU()
			for j := 0; j < hiddenSize; j++ {
				acc[j] += sCPU.DataFloat32()[j]
			}
		}
	}

	// Important: copy the result back into the output activation (which might be on GPU).
	return out.CopyFrom(outCPU)
}