package nn import ( "sort" ) // MoEChoice represents a selected expert with its weight. type MoEChoice struct { Idx int Weight float32 } // TopKIndices returns the indices of the top-k largest values in scores. func TopKIndices(scores []float32, k int) []int { if k <= 0 { k = 1 } if k > len(scores) { k = len(scores) } idx := make([]int, len(scores)) for i := range idx { idx[i] = i } sort.Slice(idx, func(i, j int) bool { return scores[idx[i]] > scores[idx[j]] }) return idx[:k] } // SelectTopKExperts selects top-k experts from scores and returns their indices and weights. // If useOriginalWeights is provided, weights are taken from that slice instead of scores. func SelectTopKExperts(scores []float32, k int, useOriginalWeights []float32) []MoEChoice { if k <= 0 { k = 1 } if k > len(scores) { k = len(scores) } type scored struct { idx int score float32 } choices := make([]scored, len(scores)) for i, s := range scores { choices[i] = scored{idx: i, score: s} } sort.Slice(choices, func(i, j int) bool { return choices[i].score > choices[j].score }) result := make([]MoEChoice, k) for i := 0; i < k; i++ { idx := choices[i].idx w := scores[idx] if useOriginalWeights != nil { w = useOriginalWeights[idx] } result[i] = MoEChoice{Idx: idx, Weight: w} } return result } // GroupedTopKMask applies grouped top-k selection as in DeepSeek-V3/Kimi MoE. // Returns a masked score array where only experts from selected groups are non-zero. // // Parameters: // - scores: router scores with bias already added (for selection) // - numGroups: number of expert groups // - topKGroup: how many groups to keep // // Returns masked scores suitable for final top-k selection. func GroupedTopKMask(scores []float32, numGroups, topKGroup int) []float32 { if numGroups <= 0 { numGroups = 1 } numExperts := len(scores) if numExperts%numGroups != 0 { return scores // fallback: no masking } perGroup := numExperts / numGroups // Compute group scores (sum of top-2 in each group) groupScores := make([]float32, numGroups) for gi := 0; gi < numGroups; gi++ { base := gi * perGroup seg := scores[base : base+perGroup] top2Idx := TopKIndices(seg, 2) s := float32(0) for _, id := range top2Idx { s += seg[id] } groupScores[gi] = s } // Select top-k groups keepGroups := TopKIndices(groupScores, topKGroup) keep := make([]bool, numGroups) for _, gi := range keepGroups { keep[gi] = true } // Create masked output masked := make([]float32, numExperts) for gi := 0; gi < numGroups; gi++ { base := gi * perGroup if keep[gi] { copy(masked[base:base+perGroup], scores[base:base+perGroup]) } // else: zeros (already initialized) } return masked } // RenormalizeMoEWeights normalizes weights to sum to 1. func RenormalizeMoEWeights(choices []MoEChoice) { if len(choices) == 0 { return } sum := float32(0) for _, c := range choices { sum += c.Weight } if sum == 0 { return } inv := 1 / sum for i := range choices { choices[i].Weight *= inv } } // ScaleMoEWeights multiplies all weights by the given factor. func ScaleMoEWeights(choices []MoEChoice, factor float32) { for i := range choices { choices[i].Weight *= factor } } // MoERouterActivation applies activation function to router logits. func MoERouterActivation(logits []float32, activationFunc string) []float32 { scores := make([]float32, len(logits)) switch activationFunc { case "sigmoid": for i, v := range logits { scores[i] = Sigmoid(v) } case "softmax": copy(scores, logits) SoftmaxInplaceSimple(scores) default: // Default to sigmoid for i, v := range logits { scores[i] = Sigmoid(v) } } return scores } // SoftmaxInplaceSimple applies softmax normalization in-place (simple scalar version). func SoftmaxInplaceSimple(data []float32) { if len(data) == 0 { return } maxVal := data[0] for _, v := range data[1:] { if v > maxVal { maxVal = v } } sum := float32(0) for i := range data { data[i] = Exp(data[i] - maxVal) sum += data[i] } if sum == 0 { return } inv := 1 / sum for i := range data { data[i] *= inv } }