| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181 |
- package nn
- import (
- "sort"
- )
// MoEChoice pairs one selected expert with the weight assigned to it.
type MoEChoice struct {
	Idx    int     // position of the expert in the score slice it was selected from
	Weight float32 // weight attached to the expert
}
// TopKIndices returns the indices of the k largest values in scores, ordered
// from the highest score to the lowest.
//
// k is clamped to [1, len(scores)]: a non-positive k is treated as 1 and a k
// larger than len(scores) returns every index. An empty scores slice yields an
// empty result. Equal scores are returned in ascending index order, so the
// result is fully determined by the input. scores is not modified.
func TopKIndices(scores []float32, k int) []int {
	if k <= 0 {
		k = 1
	}
	if k > len(scores) {
		k = len(scores)
	}
	order := make([]int, len(scores))
	for i := range order {
		order[i] = i
	}
	// order starts in ascending index order, so a stable sort keeps tied
	// scores in that order; an unstable sort leaves tie order unspecified.
	sort.SliceStable(order, func(a, b int) bool {
		return scores[order[a]] > scores[order[b]]
	})
	return order[:k]
}
- // SelectTopKExperts selects top-k experts from scores and returns their indices and weights.
- // If useOriginalWeights is provided, weights are taken from that slice instead of scores.
- func SelectTopKExperts(scores []float32, k int, useOriginalWeights []float32) []MoEChoice {
- if k <= 0 {
- k = 1
- }
- if k > len(scores) {
- k = len(scores)
- }
- type scored struct {
- idx int
- score float32
- }
- choices := make([]scored, len(scores))
- for i, s := range scores {
- choices[i] = scored{idx: i, score: s}
- }
- sort.Slice(choices, func(i, j int) bool { return choices[i].score > choices[j].score })
- result := make([]MoEChoice, k)
- for i := 0; i < k; i++ {
- idx := choices[i].idx
- w := scores[idx]
- if useOriginalWeights != nil {
- w = useOriginalWeights[idx]
- }
- result[i] = MoEChoice{Idx: idx, Weight: w}
- }
- return result
- }
- // GroupedTopKMask applies grouped top-k selection as in DeepSeek-V3/Kimi MoE.
- // Returns a masked score array where only experts from selected groups are non-zero.
- //
- // Parameters:
- // - scores: router scores with bias already added (for selection)
- // - numGroups: number of expert groups
- // - topKGroup: how many groups to keep
- //
- // Returns masked scores suitable for final top-k selection.
- func GroupedTopKMask(scores []float32, numGroups, topKGroup int) []float32 {
- if numGroups <= 0 {
- numGroups = 1
- }
- numExperts := len(scores)
- if numExperts%numGroups != 0 {
- return scores // fallback: no masking
- }
- perGroup := numExperts / numGroups
- // Compute group scores (sum of top-2 in each group)
- groupScores := make([]float32, numGroups)
- for gi := 0; gi < numGroups; gi++ {
- base := gi * perGroup
- seg := scores[base : base+perGroup]
- top2Idx := TopKIndices(seg, 2)
- s := float32(0)
- for _, id := range top2Idx {
- s += seg[id]
- }
- groupScores[gi] = s
- }
- // Select top-k groups
- keepGroups := TopKIndices(groupScores, topKGroup)
- keep := make([]bool, numGroups)
- for _, gi := range keepGroups {
- keep[gi] = true
- }
- // Create masked output
- masked := make([]float32, numExperts)
- for gi := 0; gi < numGroups; gi++ {
- base := gi * perGroup
- if keep[gi] {
- copy(masked[base:base+perGroup], scores[base:base+perGroup])
- }
- // else: zeros (already initialized)
- }
- return masked
- }
- // RenormalizeMoEWeights normalizes weights to sum to 1.
- func RenormalizeMoEWeights(choices []MoEChoice) {
- if len(choices) == 0 {
- return
- }
- sum := float32(0)
- for _, c := range choices {
- sum += c.Weight
- }
- if sum == 0 {
- return
- }
- inv := 1 / sum
- for i := range choices {
- choices[i].Weight *= inv
- }
- }
- // ScaleMoEWeights multiplies all weights by the given factor.
- func ScaleMoEWeights(choices []MoEChoice, factor float32) {
- for i := range choices {
- choices[i].Weight *= factor
- }
- }
- // MoERouterActivation applies activation function to router logits.
- func MoERouterActivation(logits []float32, activationFunc string) []float32 {
- scores := make([]float32, len(logits))
- switch activationFunc {
- case "sigmoid":
- for i, v := range logits {
- scores[i] = Sigmoid(v)
- }
- case "softmax":
- copy(scores, logits)
- SoftmaxInplaceSimple(scores)
- default:
- // Default to sigmoid
- for i, v := range logits {
- scores[i] = Sigmoid(v)
- }
- }
- return scores
- }
- // SoftmaxInplaceSimple applies softmax normalization in-place (simple scalar version).
- func SoftmaxInplaceSimple(data []float32) {
- if len(data) == 0 {
- return
- }
- maxVal := data[0]
- for _, v := range data[1:] {
- if v > maxVal {
- maxVal = v
- }
- }
- sum := float32(0)
- for i := range data {
- data[i] = Exp(data[i] - maxVal)
- sum += data[i]
- }
- if sum == 0 {
- return
- }
- inv := 1 / sum
- for i := range data {
- data[i] *= inv
- }
- }
|