//go:build cuda

// Package compute provides CUDA-accelerated neural network operations.
package compute

import (
	"fmt"
	"math"
	"unsafe"

	"makarna/pkg/backend/cpu"
	"makarna/pkg/backend/cuda"
	"makarna/pkg/backend/device"
	"makarna/pkg/profile"
	"makarna/pkg/tensor"
)

// RMSNorm applies RMS normalization in-place.
// It uses CUDA when the context is placed on a GPU layer.
func RMSNorm(ctx *Context, x, w tensor.Tensor, eps float32) error {
	useGPU := ctx != nil && ctx.IsGPU() && device.CUDAAvailable()
	if !useGPU {
		profile.Start("RMSNorm/CPU")
		err := rmsNormCPU(x, w, eps)
		profile.End("RMSNorm/CPU")
		return err
	}

	profile.Start("RMSNorm/GPU")
	defer profile.End("RMSNorm/GPU")

	// The GPU path still expects host-resident tensors; data is staged
	// onto the device for the kernel and copied back afterwards.
	xCPU, ok := x.(*cpu.Tensor)
	if !ok {
		return fmt.Errorf("rmsnorm: x must be CPU tensor")
	}
	wCPU, ok := w.(*cpu.Tensor)
	if !ok {
		return fmt.Errorf("rmsnorm: w must be CPU tensor")
	}

	gpu := ctx.Placement().GPU
	shape := x.Shape()
	seqLen := shape[0]
	dim := shape[1]

	// Upload x to the GPU.
	profile.Start("RMSNorm/alloc_x")
	gpuX, err := cuda.NewTensor(shape, tensor.Float32, gpu)
	profile.End("RMSNorm/alloc_x")
	if err != nil {
		return err
	}
	if err := gpuX.CopyFrom(xCPU.DataFloat32()); err != nil {
		return err
	}

	// Fetch the weight from the per-device cache, uploading on first use.
	cache := GetWeightCache(gpu)
	wKey := fmt.Sprintf("rmsnorm_%p", wCPU)
	gpuW, ok := cache.Get(wKey)
	if !ok {
		gpuW, err = cache.Upload(wKey, wCPU)
		if err != nil {
			return err
		}
	}

	// Run the CUDA RMSNorm kernel.
	profile.Start("RMSNorm/kernel")
	err = cuda.RMSNorm(gpuX.Data().(unsafe.Pointer), gpuW, seqLen, dim, eps, gpu)
	profile.End("RMSNorm/kernel")
	if err != nil {
		return err
	}

	// Copy the normalized result back to the host.
	if err := gpuX.CopyToHost(xCPU.DataFloat32()); err != nil {
		return err
	}
	return nil
}

func rmsNormCPU(x, w tensor.Tensor, eps float32) error {
	xCPU := x.(*cpu.Tensor)
	wCPU := w.(*cpu.Tensor)
	xData := xCPU.DataFloat32()
	wData := wCPU.DataFloat32()
	dim := wCPU.Shape().NumElements()
	numRows := xCPU.Shape().NumElements() / dim

	for i := 0; i < numRows; i++ {
		row := xData[i*dim : (i+1)*dim]
		// RMS(row) = sqrt(mean(row^2) + eps); scale each element by
		// 1/RMS times the learned per-channel weight.
		ss := cpu.DotFloat32(row, row) / float32(dim)
		invRMS := 1.0 / float32(math.Sqrt(float64(ss+eps)))
		for j := 0; j < dim; j++ {
			row[j] = row[j] * invRMS * wData[j]
		}
	}
	return nil
}
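// Usage sketch for RMSNorm. This is a minimal sketch, not taken from
// this file: cpu.NewTensor and ctx stand in for whatever constructors
// the cpu and compute packages actually expose.
//
//	x, _ := cpu.NewTensor(tensor.Shape{seqLen, dim}, tensor.Float32) // activations, one row per token
//	w, _ := cpu.NewTensor(tensor.Shape{dim}, tensor.Float32)         // learned per-channel scale
//	if err := RMSNorm(ctx, x, w, 1e-5); err != nil {
//		return err
//	}
//	// x is normalized in-place: x[i][j] = x[i][j] / RMS(x[i]) * w[j],
//	// on the GPU when ctx targets a CUDA device, otherwise on the CPU.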
// RoPE applies Rotary Positional Embeddings in-place.
// positions must contain one absolute position per row of x.
func RoPE(ctx *Context, x tensor.Tensor, positions []int, headDim int, theta float32) error {
	if len(positions) != x.Shape()[0] {
		return fmt.Errorf("rope: got %d positions for %d rows", len(positions), x.Shape()[0])
	}

	useGPU := ctx != nil && ctx.IsGPU() && device.CUDAAvailable()
	if !useGPU {
		profile.Start("RoPE/CPU")
		err := ropeCPU(x, positions, headDim, theta)
		profile.End("RoPE/CPU")
		return err
	}

	profile.Start("RoPE/GPU")
	defer profile.End("RoPE/GPU")

	xCPU := x.(*cpu.Tensor)
	gpu := ctx.Placement().GPU
	shape := x.Shape()
	seqLen := shape[0]
	totalDim := shape[1]
	numHeads := totalDim / headDim

	// Upload x to the GPU.
	gpuX, err := cuda.NewTensor(shape, tensor.Float32, gpu)
	if err != nil {
		return err
	}
	if err := gpuX.CopyFrom(xCPU.DataFloat32()); err != nil {
		return err
	}

	// Upload positions. int32 and float32 are both 4 bytes, so a
	// Float32 allocation of the same length holds the int32 data; the
	// slice is reinterpreted below and the kernel reads it as int32.
	posData := make([]int32, len(positions))
	for i, p := range positions {
		posData[i] = int32(p)
	}
	gpuPos, err := cuda.NewTensor(tensor.Shape{len(positions)}, tensor.Float32, gpu)
	if err != nil {
		return err
	}
	posPtr := unsafe.Pointer(&posData[0])
	if err := gpuPos.CopyFrom(unsafe.Slice((*float32)(posPtr), len(posData))); err != nil {
		return err
	}

	// Run the CUDA RoPE kernel.
	profile.Start("RoPE/kernel")
	err = cuda.RoPE(gpuX.Data().(unsafe.Pointer), gpuPos.Data().(unsafe.Pointer), seqLen, numHeads, headDim, theta, gpu)
	profile.End("RoPE/kernel")
	if err != nil {
		return err
	}

	// Copy the rotated result back to the host.
	if err := gpuX.CopyToHost(xCPU.DataFloat32()); err != nil {
		return err
	}
	return nil
}

func ropeCPU(x tensor.Tensor, positions []int, headDim int, theta float32) error {
	xCPU := x.(*cpu.Tensor)
	data := xCPU.DataFloat32()
	shape := x.Shape()
	seqLen := shape[0]
	totalDim := shape[1]
	halfDim := headDim / 2

	// Precompute inverse frequencies: invFreqs[j] = theta^(-2j/headDim).
	invFreqs := make([]float64, halfDim)
	for j := 0; j < halfDim; j++ {
		invFreqs[j] = 1.0 / math.Pow(float64(theta), float64(2*j)/float64(headDim))
	}

	for seq := 0; seq < seqLen; seq++ {
		pos := positions[seq]
		rowStart := seq * totalDim
		for headStart := 0; headStart < totalDim; headStart += headDim {
			// Rotate the pair (j, j+halfDim) within each head by the
			// position-dependent angle pos * invFreqs[j].
			for j := 0; j < halfDim; j++ {
				freq := float64(pos) * invFreqs[j]
				sin, cos := math.Sincos(freq)
				idx0 := rowStart + headStart + j
				idx1 := rowStart + headStart + j + halfDim
				v0 := data[idx0]
				v1 := data[idx1]
				data[idx0] = v0*float32(cos) - v1*float32(sin)
				data[idx1] = v1*float32(cos) + v0*float32(sin)
			}
		}
	}
	return nil
}
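// Usage sketch for RoPE, under the same caveat as the RMSNorm sketch
// above (cpu.NewTensor and ctx are assumed placeholders). Positions
// are absolute token indices, so continuing generation from a cached
// prefix just offsets them:
//
//	q, _ := cpu.NewTensor(tensor.Shape{seqLen, numHeads * headDim}, tensor.Float32)
//	positions := make([]int, seqLen)
//	for i := range positions {
//		positions[i] = cacheLen + i // absolute position of each new token
//	}
//	if err := RoPE(ctx, q, positions, headDim, 10000.0); err != nil {
//		return err
//	}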
// SwiGLU computes SiLU(gate) * up element-wise and stores the result in out.
func SwiGLU(ctx *Context, gate, up, out tensor.Tensor) error {
	useGPU := ctx != nil && ctx.IsGPU() && device.CUDAAvailable()
	if !useGPU {
		profile.Start("SwiGLU/CPU")
		err := swigluCPU(gate, up, out)
		profile.End("SwiGLU/CPU")
		return err
	}

	profile.Start("SwiGLU/GPU")
	defer profile.End("SwiGLU/GPU")

	gCPU := gate.(*cpu.Tensor)
	uCPU := up.(*cpu.Tensor)
	oCPU := out.(*cpu.Tensor)
	gpu := ctx.Placement().GPU
	n := gate.Shape().NumElements()

	// Upload gate and up to the GPU.
	gpuGate, err := cuda.NewTensor(gate.Shape(), tensor.Float32, gpu)
	if err != nil {
		return err
	}
	gpuUp, err := cuda.NewTensor(up.Shape(), tensor.Float32, gpu)
	if err != nil {
		return err
	}
	if err := gpuGate.CopyFrom(gCPU.DataFloat32()); err != nil {
		return err
	}
	if err := gpuUp.CopyFrom(uCPU.DataFloat32()); err != nil {
		return err
	}

	// SiLU(gate) in-place.
	profile.Start("SwiGLU/silu_kernel")
	if err := cuda.SiLU(gpuGate.Data().(unsafe.Pointer), n, gpu); err != nil {
		profile.End("SwiGLU/silu_kernel")
		return err
	}
	profile.End("SwiGLU/silu_kernel")

	// gate = gate * up
	profile.Start("SwiGLU/mul_kernel")
	if err := cuda.MulInplace(gpuGate.Data().(unsafe.Pointer), gpuUp.Data().(unsafe.Pointer), n, gpu); err != nil {
		profile.End("SwiGLU/mul_kernel")
		return err
	}
	profile.End("SwiGLU/mul_kernel")

	// Copy the result back to the output tensor on the host.
	if err := gpuGate.CopyToHost(oCPU.DataFloat32()); err != nil {
		return err
	}
	return nil
}

func swigluCPU(gate, up, out tensor.Tensor) error {
	gCPU := gate.(*cpu.Tensor)
	uCPU := up.(*cpu.Tensor)
	oCPU := out.(*cpu.Tensor)
	gData := gCPU.DataFloat32()
	uData := uCPU.DataFloat32()
	oData := oCPU.DataFloat32()

	for i := range gData {
		gv := gData[i]
		// SiLU(x) = x * sigmoid(x)
		silu := gv / (1.0 + float32(math.Exp(float64(-gv))))
		oData[i] = silu * uData[i]
	}
	return nil
}

// Softmax applies softmax along the last dimension.
// It currently runs on the CPU regardless of ctx placement.
func Softmax(ctx *Context, x tensor.Tensor) error {
	xCPU := x.(*cpu.Tensor)
	return softmaxCPU(xCPU)
}

func softmaxCPU(x *cpu.Tensor) error {
	data := x.DataFloat32()
	shape := x.Shape()
	if len(shape) != 2 {
		return fmt.Errorf("softmax: expected 2D tensor")
	}
	rows := shape[0]
	cols := shape[1]

	for i := 0; i < rows; i++ {
		row := data[i*cols : (i+1)*cols]
		// Subtract the row max before exponentiating for numerical stability.
		maxVal := row[0]
		for _, v := range row[1:] {
			if v > maxVal {
				maxVal = v
			}
		}
		sum := float32(0)
		for j := range row {
			row[j] = float32(math.Exp(float64(row[j] - maxVal)))
			sum += row[j]
		}
		for j := range row {
			row[j] /= sum
		}
	}
	return nil
}

// Add performs element-wise addition: dst += src.
// Both tensors must hold the same number of elements.
func Add(dst, src tensor.Tensor) error {
	dCPU := dst.(*cpu.Tensor)
	sCPU := src.(*cpu.Tensor)
	dData := dCPU.DataFloat32()
	sData := sCPU.DataFloat32()
	if len(dData) != len(sData) {
		return fmt.Errorf("add: size mismatch %d vs %d", len(dData), len(sData))
	}
	for i := range dData {
		dData[i] += sData[i]
	}
	return nil
}

// CopyData copies tensor data: dst = src.
func CopyData(dst, src tensor.Tensor) error {
	dCPU := dst.(*cpu.Tensor)
	sCPU := src.(*cpu.Tensor)
	copy(dCPU.DataFloat32(), sCPU.DataFloat32())
	return nil
}
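// Putting the primitives together: a sketch of a standard transformer
// SwiGLU feed-forward block built from the ops in this file. MatMul is
// not defined here, so it appears as a hypothetical helper; the weight
// and scratch tensors are likewise assumptions for illustration only:
//
//	// out = (SiLU(x @ wGate) * (x @ wUp)) @ wDown, plus residual
//	_ = MatMul(ctx, x, wGate, gate) // hypothetical projection helper
//	_ = MatMul(ctx, x, wUp, up)     // hypothetical projection helper
//	_ = SwiGLU(ctx, gate, up, hidden)
//	_ = MatMul(ctx, hidden, wDown, out) // hypothetical
//	_ = Add(out, residual) // residual connection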