//go:build !cuda

// Package compute provides CPU-only implementations of hybrid operations.
// When CUDA is not available, all operations fall back to CPU.
package compute

import (
	"math"

	"makarna/pkg/backend/cpu"
	"makarna/pkg/backend/cpu/matmul"
	"makarna/pkg/backend/cpu/nn"
	"makarna/pkg/tensor"
)

// HybridLinear performs matrix multiplication (CPU-only path).
func HybridLinear(ctx *Context, input *Activation, weight tensor.Tensor, output *Activation) error {
	inCPU, err := input.AsCPU()
	if err != nil {
		return err
	}
	outCPU, err := output.AsCPU()
	if err != nil {
		return err
	}
	// In the CPU-only build, weights are expected to already live on the CPU.
	wCPU := weight.(*cpu.Tensor)
	return matmul.Linear(inCPU, wCPU, outCPU)
}

// HybridRMSNorm applies RMS normalization in-place (CPU-only).
func HybridRMSNorm(ctx *Context, x *Activation, w tensor.Tensor, eps float32) error {
	xCPU, err := x.AsCPU()
	if err != nil {
		return err
	}
	wCPU := w.(*cpu.Tensor)
	xData := xCPU.DataFloat32()
	wData := wCPU.DataFloat32()
	dim := wCPU.Shape().NumElements()
	numRows := xCPU.Shape().NumElements() / dim
	for i := 0; i < numRows; i++ {
		row := xData[i*dim : (i+1)*dim]
		// Mean of squares over the row, then scale by 1/RMS and the weight.
		ss := cpu.DotFloat32(row, row) / float32(dim)
		invRMS := 1.0 / float32(math.Sqrt(float64(ss+eps)))
		for j := 0; j < dim; j++ {
			row[j] = row[j] * invRMS * wData[j]
		}
	}
	return nil
}

// HybridRoPE applies rotary positional embeddings in-place (CPU-only).
func HybridRoPE(ctx *Context, x *Activation, positions []int, headDim int, theta float32) error {
	xCPU, err := x.AsCPU()
	if err != nil {
		return err
	}
	data := xCPU.DataFloat32()
	shape := x.Shape()
	seqLen := shape[0]
	totalDim := shape[1]
	halfDim := headDim / 2

	// Precompute the per-dimension inverse frequencies: theta^(-2j/headDim).
	invFreqs := make([]float64, halfDim)
	for j := 0; j < halfDim; j++ {
		invFreqs[j] = 1.0 / math.Pow(float64(theta), float64(2*j)/float64(headDim))
	}

	for seq := 0; seq < seqLen; seq++ {
		pos := positions[seq]
		rowStart := seq * totalDim
		for headStart := 0; headStart < totalDim; headStart += headDim {
			for j := 0; j < halfDim; j++ {
				freq := float64(pos) * invFreqs[j]
				sin, cos := math.Sincos(freq)
				// Rotate the pair (j, j+halfDim) within each head.
				idx0 := rowStart + headStart + j
				idx1 := rowStart + headStart + j + halfDim
				v0 := data[idx0]
				v1 := data[idx1]
				data[idx0] = v0*float32(cos) - v1*float32(sin)
				data[idx1] = v1*float32(cos) + v0*float32(sin)
			}
		}
	}
	return nil
}

// HybridSoftmax applies softmax along the last dimension in-place (CPU-only).
func HybridSoftmax(ctx *Context, x *Activation) error {
	xCPU, err := x.AsCPU()
	if err != nil {
		return err
	}
	data := xCPU.DataFloat32()
	shape := x.Shape()
	rows, cols := shape[0], shape[1]
	for i := 0; i < rows; i++ {
		row := data[i*cols : (i+1)*cols]
		// Subtract the row max before exponentiating for numerical stability.
		maxVal := row[0]
		for _, v := range row[1:] {
			if v > maxVal {
				maxVal = v
			}
		}
		sum := float32(0)
		for j := range row {
			row[j] = float32(math.Exp(float64(row[j] - maxVal)))
			sum += row[j]
		}
		for j := range row {
			row[j] /= sum
		}
	}
	return nil
}

// HybridSiLU applies SiLU activation in-place (CPU-only).
func HybridSiLU(ctx *Context, x *Activation) error {
	xCPU, err := x.AsCPU()
	if err != nil {
		return err
	}
	return nn.SiLU(xCPU)
}

// HybridSwiGLU computes out = SiLU(gate) * up element-wise (CPU-only).
func HybridSwiGLU(ctx *Context, gate, up, out *Activation) error {
	if err := HybridCopy(ctx, out, gate); err != nil {
		return err
	}
	if err := HybridSiLU(ctx, out); err != nil {
		return err
	}
	return HybridMul(ctx, out, up)
}

// HybridMul performs element-wise multiplication: a = a * b (CPU-only).
func HybridMul(ctx *Context, a, b *Activation) error {
	aCPU, err := a.AsCPU()
	if err != nil {
		return err
	}
	bCPU, err := b.AsCPU()
	if err != nil {
		return err
	}
	aData := aCPU.DataFloat32()
	bData := bCPU.DataFloat32()
	for i := range aData {
		aData[i] *= bData[i]
	}
	return nil
}

// HybridAdd performs element-wise addition: a = a + b (CPU-only).
func HybridAdd(ctx *Context, a, b *Activation) error {
	aCPU, err := a.AsCPU()
	if err != nil {
		return err
	}
	bCPU, err := b.AsCPU()
	if err != nil {
		return err
	}
	aData := aCPU.DataFloat32()
	bData := bCPU.DataFloat32()
	for i := range aData {
		aData[i] += bData[i]
	}
	return nil
}

// HybridAttention computes full causal attention (CPU-only).
func HybridAttention(ctx *Context, Q, K, V, out *Activation, numHeads, numKVHeads, headDim int, scale float32, startPos int) error {
	qCPU, err := Q.AsCPU()
	if err != nil {
		return err
	}
	kCPU, err := K.AsCPU()
	if err != nil {
		return err
	}
	vCPU, err := V.AsCPU()
	if err != nil {
		return err
	}
	outCPU, err := out.AsCPU()
	if err != nil {
		return err
	}
	qData := qCPU.DataFloat32()
	kData := kCPU.DataFloat32()
	vData := vCPU.DataFloat32()
	outData := outCPU.DataFloat32()

	seqLen := Q.Shape()[0]
	kvLen := K.Shape()[0]
	// Grouped-query attention: consecutive query heads share one KV head.
	headsPerKV := numHeads / numKVHeads

	for h := 0; h < numHeads; h++ {
		kvHead := h / headsPerKV
		for q := 0; q < seqLen; q++ {
			qOffset := q*numHeads*headDim + h*headDim
			qVec := qData[qOffset : qOffset+headDim]

			// Scaled dot-product scores with a causal mask: query q may only
			// attend to cache positions up to startPos+q.
			scores := make([]float32, kvLen)
			maxScore := float32(-1e9)
			for k := 0; k < kvLen; k++ {
				if k > startPos+q {
					scores[k] = float32(-1e9)
					continue
				}
				kOffset := k*numKVHeads*headDim + kvHead*headDim
				kVec := kData[kOffset : kOffset+headDim]
				dot := float32(0)
				for d := 0; d < headDim; d++ {
					dot += qVec[d] * kVec[d]
				}
				scores[k] = dot * scale
				if scores[k] > maxScore {
					maxScore = scores[k]
				}
			}

			// Numerically stable softmax over the scores.
			sum := float32(0)
			for k := range scores {
				scores[k] = float32(math.Exp(float64(scores[k] - maxScore)))
				sum += scores[k]
			}
			for k := range scores {
				scores[k] /= sum
			}

			// Weighted sum of the value vectors.
			outOffset := q*numHeads*headDim + h*headDim
			for d := 0; d < headDim; d++ {
				acc := float32(0)
				for k := 0; k < kvLen; k++ {
					vOffset := k*numKVHeads*headDim + kvHead*headDim
					acc += scores[k] * vData[vOffset+d]
				}
				outData[outOffset+d] = acc
			}
		}
	}
	return nil
}

// HybridCopy copies src to dst (CPU-only).
func HybridCopy(ctx *Context, dst, src *Activation) error {
	dstCPU, err := dst.AsCPU()
	if err != nil {
		return err
	}
	srcCPU, err := src.AsCPU()
	if err != nil {
		return err
	}
	copy(dstCPU.DataFloat32(), srcCPU.DataFloat32())
	return nil
}

// EnsureOnDevice moves activation to target device if needed (CPU-only stub).
func EnsureOnDevice(a *Activation, target tensor.DevicePlacement) error {
	_, err := a.EnsureOn(target)
	return err
}
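// exampleSwiGLUBlock is a non-authoritative sketch of how the primitives
// above might compose into a single SwiGLU feed-forward block:
// out = wDown · (SiLU(wGate·x) * (wUp·x)). All activations and weights are
// assumed to be pre-allocated with compatible shapes by the caller; this
// file does not define their construction, so the function is purely
// illustrative and is not called anywhere in the package.
func exampleSwiGLUBlock(ctx *Context, x, gate, up, out *Activation, wGate, wUp, wDown tensor.Tensor) error {
	if err := HybridLinear(ctx, x, wGate, gate); err != nil {
		return err
	}
	if err := HybridLinear(ctx, x, wUp, up); err != nil {
		return err
	}
	// Fuse in place: gate = SiLU(gate) * up (out may alias gate here).
	if err := HybridSwiGLU(ctx, gate, up, gate); err != nil {
		return err
	}
	return HybridLinear(ctx, gate, wDown, out)
}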