package nn

import (
	"math"

	"makarna/pkg/backend/cpu"
)

// SiLU applies x * sigmoid(x) in-place using the fastest available kernel.
func SiLU(x *cpu.Tensor) error {
	siluInplace(x.DataFloat32())
	return nil
}

// SwiGLU computes out = SiLU(gate) * up. It does not mutate gate unless
// gate shares storage with out. All three tensors are expected to have
// the same length.
func SwiGLU(gate, up, out *cpu.Tensor) error {
	gData := gate.DataFloat32()
	uData := up.DataFloat32()
	oData := out.DataFloat32()
	if len(oData) == 0 {
		return nil
	}
	// Skip the copy when gate and out alias the same buffer.
	if &gData[0] != &oData[0] {
		copy(oData, gData)
	}
	siluInplace(oData)
	for i := range oData {
		oData[i] *= uData[i]
	}
	return nil
}

// siluInplace selects the SIMD kernel when available, falling back to scalar
// for any tail elements that do not fill a full vector.
func siluInplace(data []float32) {
	if len(data) == 0 {
		return
	}
	switch {
	case hasSiLUAVX512 && cpu.SupportsAVX512():
		main := len(data) &^ 15 // round down to a multiple of 16 float32 lanes
		if main > 0 {
			siluAVX512Asm(&data[0], main)
		}
		if main == len(data) {
			return
		}
		data = data[main:] // leave the remainder for the scalar loop
	case hasSiLUAVX2 && cpu.SupportsAVX2():
		main := len(data) &^ 7 // round down to a multiple of 8 float32 lanes
		if main > 0 {
			siluAVX2Asm(&data[0], main)
		}
		if main == len(data) {
			return
		}
		data = data[main:]
	}
	siluScalar(data)
}

// siluScalar is the portable reference: v * sigmoid(v) = v / (1 + e^-v).
func siluScalar(data []float32) {
	for i := range data {
		v := data[i]
		data[i] = v / (1.0 + float32(math.Exp(float64(-v))))
	}
}
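
// The dispatch above assumes per-architecture sibling files that define the
// hasSiLU* feature flags and the assembly entry points. Nothing in this file
// confirms their exact shape; a minimal sketch of what a silu_amd64.go could
// look like (file name, build tag, and signatures are assumptions):
//
//	//go:build amd64
//
//	package nn
//
//	const (
//		hasSiLUAVX2   = true
//		hasSiLUAVX512 = true
//	)
//
//	// Implemented in an accompanying .s file; n is expected to be a
//	// multiple of the kernel's vector width, as siluInplace guarantees.
//	//go:noescape
//	func siluAVX2Asm(p *float32, n int)
//
//	//go:noescape
//	func siluAVX512Asm(p *float32, n int)
//
// A non-amd64 counterpart would set both flags to false so that siluInplace
// always takes the scalar path.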
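
// A cheap way to validate the kernel dispatch against the scalar reference
// is a round-trip test in this package. The sketch below is hypothetical and
// uses only functions defined above; a slice length of 37 (not a multiple of
// 8 or 16) deliberately exercises both the SIMD body and the scalar tail:
//
//	func TestSiLUInplaceMatchesScalar(t *testing.T) {
//		data := make([]float32, 37)
//		want := make([]float32, len(data))
//		for i := range data {
//			data[i] = float32(i)*0.25 - 4 // spans negative and positive inputs
//		}
//		copy(want, data)
//		siluScalar(want)
//		siluInplace(data)
//		for i := range data {
//			if d := math.Abs(float64(data[i] - want[i])); d > 1e-5 {
//				t.Fatalf("index %d: got %g, want %g", i, data[i], want[i])
//			}
//		}
//	}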