| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237 |
- //go:build !cuda
- // Package compute provides device-agnostic neural network operations.
- package compute
- import (
- "fmt"
- "math"
- "makarna/pkg/backend/cpu"
- "makarna/pkg/backend/cpu/nn"
- "makarna/pkg/backend/device"
- "makarna/pkg/tensor"
- )
- // RMSNorm applies RMS normalization in-place.
- // For GPU tensors, temporarily copies to CPU (CUDA kernel not implemented yet).
- func RMSNorm(ctx *Context, x, w tensor.Tensor, eps float32) error {
- // Currently always use CPU path
- // TODO: Implement CUDA RMSNorm kernel
- xCPU, err := ensureCPUForOp(x)
- if err != nil {
- return fmt.Errorf("rmsnorm: %w", err)
- }
- wCPU, ok := w.(*cpu.Tensor)
- if !ok {
- return fmt.Errorf("rmsnorm: weight must be CPU tensor")
- }
- return rmsNormCPU(xCPU, wCPU, eps)
- }
- func rmsNormCPU(x, w *cpu.Tensor, eps float32) error {
- xData := x.DataFloat32()
- wData := w.DataFloat32()
- dim := w.Shape().NumElements()
- numRows := x.Shape().NumElements() / dim
- for i := 0; i < numRows; i++ {
- row := xData[i*dim : (i+1)*dim]
- // Sum of squares
- ss := cpu.DotFloat32(row, row) / float32(dim)
- // Normalize and scale
- invRMS := 1.0 / float32(math.Sqrt(float64(ss+eps)))
- for j := 0; j < dim; j++ {
- row[j] = row[j] * invRMS * wData[j]
- }
- }
- return nil
- }
- // RoPE applies Rotary Positional Embeddings in-place.
- func RoPE(ctx *Context, x tensor.Tensor, positions []int, headDim int, theta float32) error {
- xCPU, err := ensureCPUForOp(x)
- if err != nil {
- return fmt.Errorf("rope: %w", err)
- }
- return ropeCPU(xCPU, positions, headDim, theta)
- }
- func ropeCPU(x *cpu.Tensor, positions []int, headDim int, theta float32) error {
- data := x.DataFloat32()
- shape := x.Shape()
- seqLen := shape[0]
- totalDim := shape[1]
- halfDim := headDim / 2
- invFreqs := make([]float64, halfDim)
- for j := 0; j < halfDim; j++ {
- invFreqs[j] = 1.0 / math.Pow(float64(theta), float64(2*j)/float64(headDim))
- }
- for seq := 0; seq < seqLen; seq++ {
- pos := positions[seq]
- rowStart := seq * totalDim
- for headStart := 0; headStart < totalDim; headStart += headDim {
- for j := 0; j < halfDim; j++ {
- freq := float64(pos) * invFreqs[j]
- sin, cos := math.Sincos(freq)
- idx0 := rowStart + headStart + j
- idx1 := rowStart + headStart + j + halfDim
- v0 := data[idx0]
- v1 := data[idx1]
- data[idx0] = v0*float32(cos) - v1*float32(sin)
- data[idx1] = v1*float32(cos) + v0*float32(sin)
- }
- }
- }
- return nil
- }
- // SwiGLU computes SiLU(gate) * up and stores in out.
- func SwiGLU(ctx *Context, gate, up, out tensor.Tensor) error {
- gCPU, err := ensureCPUForOp(gate)
- if err != nil {
- return err
- }
- uCPU, err := ensureCPUForOp(up)
- if err != nil {
- return err
- }
- oCPU, err := ensureCPUForOp(out)
- if err != nil {
- return err
- }
- return swigluCPU(gCPU, uCPU, oCPU)
- }
- func swigluCPU(gate, up, out *cpu.Tensor) error {
- gData := gate.DataFloat32()
- uData := up.DataFloat32()
- oData := out.DataFloat32()
- if len(oData) == 0 {
- return nil
- }
- copy(oData, gData)
- if err := nn.SiLU(out); err != nil {
- return err
- }
- for i := range oData {
- oData[i] *= uData[i]
- }
- return nil
- }
- // Softmax applies softmax along the last dimension.
- func Softmax(ctx *Context, x tensor.Tensor) error {
- xCPU, err := ensureCPUForOp(x)
- if err != nil {
- return err
- }
- return softmaxCPU(xCPU)
- }
- func softmaxCPU(x *cpu.Tensor) error {
- data := x.DataFloat32()
- shape := x.Shape()
- if len(shape) != 2 {
- return fmt.Errorf("softmax: expected 2D tensor")
- }
- rows := shape[0]
- cols := shape[1]
- for i := 0; i < rows; i++ {
- row := data[i*cols : (i+1)*cols]
- // Find max for numerical stability
- maxVal := row[0]
- for _, v := range row[1:] {
- if v > maxVal {
- maxVal = v
- }
- }
- // Exp and sum
- sum := float32(0)
- for j := range row {
- row[j] = float32(math.Exp(float64(row[j] - maxVal)))
- sum += row[j]
- }
- // Normalize
- for j := range row {
- row[j] /= sum
- }
- }
- return nil
- }
- // Add performs element-wise addition: dst += src
- func Add(dst, src tensor.Tensor) error {
- dCPU, err := ensureCPUForOp(dst)
- if err != nil {
- return err
- }
- sCPU, err := ensureCPUForOp(src)
- if err != nil {
- return err
- }
- dData := dCPU.DataFloat32()
- sData := sCPU.DataFloat32()
- for i := range dData {
- dData[i] += sData[i]
- }
- return nil
- }
- // CopyData copies tensor data: dst = src
- func CopyData(dst, src tensor.Tensor) error {
- dCPU, err := ensureCPUForOp(dst)
- if err != nil {
- return err
- }
- sCPU, err := ensureCPUForOp(src)
- if err != nil {
- return err
- }
- copy(dCPU.DataFloat32(), sCPU.DataFloat32())
- return nil
- }
- // ensureCPUForOp converts tensor to CPU if needed (temporary until CUDA kernels are done)
- func ensureCPUForOp(t tensor.Tensor) (*cpu.Tensor, error) {
- if cpuT, ok := t.(*cpu.Tensor); ok {
- return cpuT, nil
- }
- // Convert to CPU
- result, err := device.EnsureOn(t, tensor.DevicePlacement{Type: tensor.CPU, GPU: -1})
- if err != nil {
- return nil, err
- }
- cpuT, ok := result.(*cpu.Tensor)
- if !ok {
- return nil, fmt.Errorf("expected CPU tensor after conversion")
- }
- return cpuT, nil
- }
|