package nn

import (
	"sort"
)

// MoEChoice represents a selected expert with its weight.
type MoEChoice struct {
	// Idx is the expert's position in the score slice it was selected from.
	Idx int
	// Weight is the routing weight recorded for the expert. It starts as the
	// expert's score (or the caller-supplied override value at Idx) and may be
	// rescaled in place by RenormalizeMoEWeights or ScaleMoEWeights.
	Weight float32
}

// TopKIndices returns the indices of the k largest values in scores, ordered
// from highest to lowest score.
//
// A non-positive k is treated as 1, and k is clamped to len(scores), so an
// empty input yields an empty result. Equal scores are ordered by ascending
// index so the result is reproducible regardless of the sort algorithm used
// by the standard library. NaN scores do not compare consistently and their
// position in the result is unspecified.
func TopKIndices(scores []float32, k int) []int {
	if k <= 0 {
		k = 1
	}
	if k > len(scores) {
		k = len(scores)
	}
	idx := make([]int, len(scores))
	for i := range idx {
		idx[i] = i
	}
	sort.Slice(idx, func(a, b int) bool {
		ia, ib := idx[a], idx[b]
		if scores[ia] != scores[ib] {
			return scores[ia] > scores[ib]
		}
		// sort.Slice is not stable; break ties explicitly by index.
		return ia < ib
	})
	return idx[:k]
}

// SelectTopKExperts picks the k highest-scoring experts and returns them in
// descending score order together with their weights.
//
// Parameters:
//   - scores: per-expert scores used for ranking
//   - k: number of experts to select; non-positive values are treated as 1
//     and values above len(scores) are clamped
//   - useOriginalWeights: when non-nil, the weight of each selected expert is
//     read from this slice (at the expert's index) instead of from scores. It
//     must be at least as long as the highest selected index, otherwise the
//     lookup panics with an index-out-of-range error.
//
// Experts with equal scores are ordered by ascending index so that the
// selection is deterministic. NaN scores do not compare consistently and
// their rank is unspecified.
func SelectTopKExperts(scores []float32, k int, useOriginalWeights []float32) []MoEChoice {
	if k <= 0 {
		k = 1
	}
	if k > len(scores) {
		k = len(scores)
	}

	order := make([]int, len(scores))
	for i := range order {
		order[i] = i
	}
	sort.Slice(order, func(a, b int) bool {
		ia, ib := order[a], order[b]
		if scores[ia] != scores[ib] {
			return scores[ia] > scores[ib]
		}
		// sort.Slice is not stable; break ties explicitly by index.
		return ia < ib
	})

	// Ranking always uses scores; only the reported weight may come from the
	// override slice.
	weights := scores
	if useOriginalWeights != nil {
		weights = useOriginalWeights
	}

	result := make([]MoEChoice, k)
	for i, idx := range order[:k] {
		result[i] = MoEChoice{Idx: idx, Weight: weights[idx]}
	}
	return result
}

// GroupedTopKMask applies grouped top-k selection as in DeepSeek-V3/Kimi MoE.
// Experts are split into numGroups contiguous, equally sized groups; each
// group is scored by the sum of its two largest expert scores (or its single
// score when a group holds one expert), and only the topKGroup best groups
// keep their scores. All other entries are set to zero.
//
// Parameters:
//   - scores: router scores with bias already added (for selection)
//   - numGroups: number of expert groups; non-positive values are treated as 1
//   - topKGroup: how many groups to keep (clamped by TopKIndices)
//
// Returns masked scores suitable for final top-k selection. The result is
// always a newly allocated slice of len(scores); scores is never modified or
// aliased. If len(scores) is not divisible by numGroups no masking is applied
// and an unmodified copy of scores is returned.
//
// NOTE(review): masked-out experts are written as 0, so if biased scores can
// be negative a masked expert could outrank a kept one in a later top-k —
// confirm against the caller's score range.
func GroupedTopKMask(scores []float32, numGroups, topKGroup int) []float32 {
	if numGroups <= 0 {
		numGroups = 1
	}
	numExperts := len(scores)
	masked := make([]float32, numExperts)
	if numExperts%numGroups != 0 {
		// Fallback: no masking. Copy so the result never aliases the input,
		// matching the masked path.
		copy(masked, scores)
		return masked
	}

	perGroup := numExperts / numGroups

	// Compute group scores (sum of the two largest scores in each group) with
	// a single linear pass per group instead of sorting every segment.
	groupScores := make([]float32, numGroups)
	for gi := range groupScores {
		seg := scores[gi*perGroup : (gi+1)*perGroup]
		switch len(seg) {
		case 0:
			continue
		case 1:
			groupScores[gi] = seg[0]
			continue
		}
		hi, lo := seg[0], seg[1]
		if lo > hi {
			hi, lo = lo, hi
		}
		for _, v := range seg[2:] {
			switch {
			case v > hi:
				hi, lo = v, hi
			case v > lo:
				lo = v
			}
		}
		groupScores[gi] = hi + lo
	}

	// Copy through only the selected groups; everything else stays zero.
	for _, gi := range TopKIndices(groupScores, topKGroup) {
		base := gi * perGroup
		copy(masked[base:base+perGroup], scores[base:base+perGroup])
	}
	return masked
}

// RenormalizeMoEWeights rescales the weights of choices in place so that they
// add up to 1. An empty slice, or one whose weights sum to exactly zero, is
// left untouched.
func RenormalizeMoEWeights(choices []MoEChoice) {
	var total float32
	for i := 0; i < len(choices); i++ {
		total += choices[i].Weight
	}
	// Covers both the empty slice and the all-cancelling case; dividing by
	// zero here would poison every weight.
	if total == 0 {
		return
	}
	scale := 1 / total
	for i := 0; i < len(choices); i++ {
		entry := &choices[i]
		entry.Weight *= scale
	}
}

// ScaleMoEWeights multiplies the weight of every entry in choices by factor,
// modifying the slice in place. Expert indices are left unchanged.
func ScaleMoEWeights(choices []MoEChoice, factor float32) {
	for i := 0; i < len(choices); i++ {
		entry := &choices[i]
		entry.Weight = entry.Weight * factor
	}
}

// MoERouterActivation applies activation function to router logits.
func MoERouterActivation(logits []float32, activationFunc string) []float32 {
	scores := make([]float32, len(logits))
	switch activationFunc {
	case "sigmoid":
		for i, v := range logits {
			scores[i] = Sigmoid(v)
		}
	case "softmax":
		copy(scores, logits)
		SoftmaxInplaceSimple(scores)
	default:
		// Default to sigmoid
		for i, v := range logits {
			scores[i] = Sigmoid(v)
		}
	}
	return scores
}

// SoftmaxInplaceSimple overwrites data with its softmax (simple scalar
// version). The largest element is subtracted from every entry before
// exponentiation, and the exponentials are then divided by their sum. Empty
// input is a no-op; if the exponentials sum to exactly zero they are left
// unnormalized.
func SoftmaxInplaceSimple(data []float32) {
	n := len(data)
	if n == 0 {
		return
	}

	// Locate the peak so the exponent arguments are never positive.
	peak := data[0]
	for i := 1; i < n; i++ {
		if data[i] > peak {
			peak = data[i]
		}
	}

	var total float32
	for i, v := range data {
		e := Exp(v - peak)
		data[i] = e
		total += e
	}
	if total == 0 {
		return
	}

	scale := 1 / total
	for i := 0; i < n; i++ {
		data[i] *= scale
	}
}