//go:build !cuda // Package compute provides device-agnostic neural network operations. package compute import ( "fmt" "math" "makarna/pkg/backend/cpu" "makarna/pkg/backend/cpu/nn" "makarna/pkg/backend/device" "makarna/pkg/tensor" ) // RMSNorm applies RMS normalization in-place. // For GPU tensors, temporarily copies to CPU (CUDA kernel not implemented yet). func RMSNorm(ctx *Context, x, w tensor.Tensor, eps float32) error { // Currently always use CPU path // TODO: Implement CUDA RMSNorm kernel xCPU, err := ensureCPUForOp(x) if err != nil { return fmt.Errorf("rmsnorm: %w", err) } wCPU, ok := w.(*cpu.Tensor) if !ok { return fmt.Errorf("rmsnorm: weight must be CPU tensor") } return rmsNormCPU(xCPU, wCPU, eps) } func rmsNormCPU(x, w *cpu.Tensor, eps float32) error { xData := x.DataFloat32() wData := w.DataFloat32() dim := w.Shape().NumElements() numRows := x.Shape().NumElements() / dim for i := 0; i < numRows; i++ { row := xData[i*dim : (i+1)*dim] // Sum of squares ss := cpu.DotFloat32(row, row) / float32(dim) // Normalize and scale invRMS := 1.0 / float32(math.Sqrt(float64(ss+eps))) for j := 0; j < dim; j++ { row[j] = row[j] * invRMS * wData[j] } } return nil } // RoPE applies Rotary Positional Embeddings in-place. func RoPE(ctx *Context, x tensor.Tensor, positions []int, headDim int, theta float32) error { xCPU, err := ensureCPUForOp(x) if err != nil { return fmt.Errorf("rope: %w", err) } return ropeCPU(xCPU, positions, headDim, theta) } func ropeCPU(x *cpu.Tensor, positions []int, headDim int, theta float32) error { data := x.DataFloat32() shape := x.Shape() seqLen := shape[0] totalDim := shape[1] halfDim := headDim / 2 invFreqs := make([]float64, halfDim) for j := 0; j < halfDim; j++ { invFreqs[j] = 1.0 / math.Pow(float64(theta), float64(2*j)/float64(headDim)) } for seq := 0; seq < seqLen; seq++ { pos := positions[seq] rowStart := seq * totalDim for headStart := 0; headStart < totalDim; headStart += headDim { for j := 0; j < halfDim; j++ { freq := float64(pos) * invFreqs[j] sin, cos := math.Sincos(freq) idx0 := rowStart + headStart + j idx1 := rowStart + headStart + j + halfDim v0 := data[idx0] v1 := data[idx1] data[idx0] = v0*float32(cos) - v1*float32(sin) data[idx1] = v1*float32(cos) + v0*float32(sin) } } } return nil } // SwiGLU computes SiLU(gate) * up and stores in out. func SwiGLU(ctx *Context, gate, up, out tensor.Tensor) error { gCPU, err := ensureCPUForOp(gate) if err != nil { return err } uCPU, err := ensureCPUForOp(up) if err != nil { return err } oCPU, err := ensureCPUForOp(out) if err != nil { return err } return swigluCPU(gCPU, uCPU, oCPU) } func swigluCPU(gate, up, out *cpu.Tensor) error { gData := gate.DataFloat32() uData := up.DataFloat32() oData := out.DataFloat32() if len(oData) == 0 { return nil } copy(oData, gData) if err := nn.SiLU(out); err != nil { return err } for i := range oData { oData[i] *= uData[i] } return nil } // Softmax applies softmax along the last dimension. func Softmax(ctx *Context, x tensor.Tensor) error { xCPU, err := ensureCPUForOp(x) if err != nil { return err } return softmaxCPU(xCPU) } func softmaxCPU(x *cpu.Tensor) error { data := x.DataFloat32() shape := x.Shape() if len(shape) != 2 { return fmt.Errorf("softmax: expected 2D tensor") } rows := shape[0] cols := shape[1] for i := 0; i < rows; i++ { row := data[i*cols : (i+1)*cols] // Find max for numerical stability maxVal := row[0] for _, v := range row[1:] { if v > maxVal { maxVal = v } } // Exp and sum sum := float32(0) for j := range row { row[j] = float32(math.Exp(float64(row[j] - maxVal))) sum += row[j] } // Normalize for j := range row { row[j] /= sum } } return nil } // Add performs element-wise addition: dst += src func Add(dst, src tensor.Tensor) error { dCPU, err := ensureCPUForOp(dst) if err != nil { return err } sCPU, err := ensureCPUForOp(src) if err != nil { return err } dData := dCPU.DataFloat32() sData := sCPU.DataFloat32() for i := range dData { dData[i] += sData[i] } return nil } // CopyData copies tensor data: dst = src func CopyData(dst, src tensor.Tensor) error { dCPU, err := ensureCPUForOp(dst) if err != nil { return err } sCPU, err := ensureCPUForOp(src) if err != nil { return err } copy(dCPU.DataFloat32(), sCPU.DataFloat32()) return nil } // ensureCPUForOp converts tensor to CPU if needed (temporary until CUDA kernels are done) func ensureCPUForOp(t tensor.Tensor) (*cpu.Tensor, error) { if cpuT, ok := t.(*cpu.Tensor); ok { return cpuT, nil } // Convert to CPU result, err := device.EnsureOn(t, tensor.DevicePlacement{Type: tensor.CPU, GPU: -1}) if err != nil { return nil, err } cpuT, ok := result.(*cpu.Tensor) if !ok { return nil, fmt.Errorf("expected CPU tensor after conversion") } return cpuT, nil }