| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283 |
- //go:build !cuda
- // Package compute provides CPU-only implementations of hybrid operations.
- // When CUDA is not available, all operations fall back to CPU.
- package compute
import (
	"fmt"
	"math"

	"makarna/pkg/backend/cpu"
	"makarna/pkg/backend/cpu/matmul"
	"makarna/pkg/backend/cpu/nn"
	"makarna/pkg/tensor"
)
- // HybridLinear performs matrix multiplication (CPU-only path).
- func HybridLinear(ctx *Context, input *Activation, weight tensor.Tensor, output *Activation) error {
- inCPU, err := input.AsCPU()
- if err != nil {
- return err
- }
- outCPU, err := output.AsCPU()
- if err != nil {
- return err
- }
- wCPU := weight.(*cpu.Tensor)
- return matmul.Linear(inCPU, wCPU, outCPU)
- }
- // HybridRMSNorm applies RMS normalization in-place (CPU-only).
- func HybridRMSNorm(ctx *Context, x *Activation, w tensor.Tensor, eps float32) error {
- xCPU, err := x.AsCPU()
- if err != nil {
- return err
- }
- wCPU := w.(*cpu.Tensor)
- xData := xCPU.DataFloat32()
- wData := wCPU.DataFloat32()
- dim := wCPU.Shape().NumElements()
- numRows := xCPU.Shape().NumElements() / dim
- for i := 0; i < numRows; i++ {
- row := xData[i*dim : (i+1)*dim]
- ss := cpu.DotFloat32(row, row) / float32(dim)
- invRMS := 1.0 / float32(math.Sqrt(float64(ss+eps)))
- for j := 0; j < dim; j++ {
- row[j] = row[j] * invRMS * wData[j]
- }
- }
- return nil
- }
- // HybridRoPE applies rotary positional embeddings in-place (CPU-only).
- func HybridRoPE(ctx *Context, x *Activation, positions []int, headDim int, theta float32) error {
- xCPU, err := x.AsCPU()
- if err != nil {
- return err
- }
- data := xCPU.DataFloat32()
- shape := x.Shape()
- seqLen := shape[0]
- totalDim := shape[1]
- halfDim := headDim / 2
- invFreqs := make([]float64, halfDim)
- for j := 0; j < halfDim; j++ {
- invFreqs[j] = 1.0 / math.Pow(float64(theta), float64(2*j)/float64(headDim))
- }
- for seq := 0; seq < seqLen; seq++ {
- pos := positions[seq]
- rowStart := seq * totalDim
- for headStart := 0; headStart < totalDim; headStart += headDim {
- for j := 0; j < halfDim; j++ {
- freq := float64(pos) * invFreqs[j]
- sin, cos := math.Sincos(freq)
- idx0 := rowStart + headStart + j
- idx1 := rowStart + headStart + j + halfDim
- v0 := data[idx0]
- v1 := data[idx1]
- data[idx0] = v0*float32(cos) - v1*float32(sin)
- data[idx1] = v1*float32(cos) + v0*float32(sin)
- }
- }
- }
- return nil
- }
- // HybridSoftmax applies softmax along the last dimension in-place (CPU-only).
- func HybridSoftmax(ctx *Context, x *Activation) error {
- xCPU, err := x.AsCPU()
- if err != nil {
- return err
- }
- data := xCPU.DataFloat32()
- shape := x.Shape()
- rows, cols := shape[0], shape[1]
- for i := 0; i < rows; i++ {
- row := data[i*cols : (i+1)*cols]
- maxVal := row[0]
- for _, v := range row[1:] {
- if v > maxVal {
- maxVal = v
- }
- }
- sum := float32(0)
- for j := range row {
- row[j] = float32(math.Exp(float64(row[j] - maxVal)))
- sum += row[j]
- }
- for j := range row {
- row[j] /= sum
- }
- }
- return nil
- }
- // HybridSiLU applies SiLU activation in-place (CPU-only).
- func HybridSiLU(ctx *Context, x *Activation) error {
- xCPU, err := x.AsCPU()
- if err != nil {
- return err
- }
- return nn.SiLU(xCPU)
- }
- func HybridSwiGLU(ctx *Context, gate, up, out *Activation) error {
- if err := HybridCopy(ctx, out, gate); err != nil {
- return err
- }
- if err := HybridSiLU(ctx, out); err != nil {
- return err
- }
- return HybridMul(ctx, out, up)
- }
- // HybridMul performs element-wise multiplication: a = a * b (CPU-only).
- func HybridMul(ctx *Context, a, b *Activation) error {
- aCPU, err := a.AsCPU()
- if err != nil {
- return err
- }
- bCPU, err := b.AsCPU()
- if err != nil {
- return err
- }
- aData := aCPU.DataFloat32()
- bData := bCPU.DataFloat32()
- for i := range aData {
- aData[i] *= bData[i]
- }
- return nil
- }
- // HybridAdd performs element-wise addition: a = a + b (CPU-only).
- func HybridAdd(ctx *Context, a, b *Activation) error {
- aCPU, err := a.AsCPU()
- if err != nil {
- return err
- }
- bCPU, err := b.AsCPU()
- if err != nil {
- return err
- }
- aData := aCPU.DataFloat32()
- bData := bCPU.DataFloat32()
- for i := range aData {
- aData[i] += bData[i]
- }
- return nil
- }
- // HybridAttention computes full causal attention (CPU-only).
- func HybridAttention(ctx *Context, Q, K, V, out *Activation, numHeads, numKVHeads, headDim int, scale float32, startPos int) error {
- qCPU, err := Q.AsCPU()
- if err != nil {
- return err
- }
- kCPU, err := K.AsCPU()
- if err != nil {
- return err
- }
- vCPU, err := V.AsCPU()
- if err != nil {
- return err
- }
- outCPU, err := out.AsCPU()
- if err != nil {
- return err
- }
- qData := qCPU.DataFloat32()
- kData := kCPU.DataFloat32()
- vData := vCPU.DataFloat32()
- outData := outCPU.DataFloat32()
- seqLen := Q.Shape()[0]
- kvLen := K.Shape()[0]
- headsPerKV := numHeads / numKVHeads
- for h := 0; h < numHeads; h++ {
- kvHead := h / headsPerKV
- for q := 0; q < seqLen; q++ {
- qOffset := q*numHeads*headDim + h*headDim
- qVec := qData[qOffset : qOffset+headDim]
- scores := make([]float32, kvLen)
- maxScore := float32(-1e9)
- for k := 0; k < kvLen; k++ {
- if k > startPos+q {
- scores[k] = float32(-1e9)
- continue
- }
- kOffset := k*numKVHeads*headDim + kvHead*headDim
- kVec := kData[kOffset : kOffset+headDim]
- dot := float32(0)
- for d := 0; d < headDim; d++ {
- dot += qVec[d] * kVec[d]
- }
- scores[k] = dot * scale
- if scores[k] > maxScore {
- maxScore = scores[k]
- }
- }
- sum := float32(0)
- for k := range scores {
- scores[k] = float32(math.Exp(float64(scores[k] - maxScore)))
- sum += scores[k]
- }
- for k := range scores {
- scores[k] /= sum
- }
- outOffset := q*numHeads*headDim + h*headDim
- for d := 0; d < headDim; d++ {
- acc := float32(0)
- for k := 0; k < kvLen; k++ {
- vOffset := k*numKVHeads*headDim + kvHead*headDim
- acc += scores[k] * vData[vOffset+d]
- }
- outData[outOffset+d] = acc
- }
- }
- }
- return nil
- }
- // HybridCopy copies src to dst (CPU-only).
- func HybridCopy(ctx *Context, dst, src *Activation) error {
- dstCPU, err := dst.AsCPU()
- if err != nil {
- return err
- }
- srcCPU, err := src.AsCPU()
- if err != nil {
- return err
- }
- copy(dstCPU.DataFloat32(), srcCPU.DataFloat32())
- return nil
- }
- // EnsureOnDevice moves activation to target device if needed (CPU-only stub).
- func EnsureOnDevice(a *Activation, target tensor.DevicePlacement) error {
- _, err := a.EnsureOn(target)
- return err
- }
|