| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340 |
- //go:build cuda
- // Package compute provides CUDA-accelerated neural network operations.
- package compute
- import (
- "fmt"
- "math"
- "unsafe"
- "makarna/pkg/backend/cpu"
- "makarna/pkg/backend/cuda"
- "makarna/pkg/backend/device"
- "makarna/pkg/profile"
- "makarna/pkg/tensor"
- )
- // RMSNorm applies RMS normalization in-place.
- // Uses CUDA when on GPU layer.
- func RMSNorm(ctx *Context, x, w tensor.Tensor, eps float32) error {
- useGPU := ctx != nil && ctx.IsGPU() && device.CUDAAvailable()
- if !useGPU {
- profile.Start("RMSNorm/CPU")
- err := rmsNormCPU(x, w, eps)
- profile.End("RMSNorm/CPU")
- return err
- }
- profile.Start("RMSNorm/GPU")
- defer profile.End("RMSNorm/GPU")
- // Get GPU pointers
- xCPU, ok := x.(*cpu.Tensor)
- if !ok {
- return fmt.Errorf("rmsnorm: x must be CPU tensor")
- }
- wCPU, ok := w.(*cpu.Tensor)
- if !ok {
- return fmt.Errorf("rmsnorm: w must be CPU tensor")
- }
- gpu := ctx.Placement().GPU
- shape := x.Shape()
- seqLen := shape[0]
- dim := shape[1]
- // Upload x to GPU
- profile.Start("RMSNorm/alloc_x")
- gpuX, err := cuda.NewTensor(shape, tensor.Float32, gpu)
- profile.End("RMSNorm/alloc_x")
- if err != nil {
- return err
- }
- if err := gpuX.CopyFrom(xCPU.DataFloat32()); err != nil {
- return err
- }
- // Get or upload weight
- cache := GetWeightCache(gpu)
- wKey := fmt.Sprintf("rmsnorm_%p", wCPU)
- gpuW, ok := cache.Get(wKey)
- if !ok {
- gpuW, err = cache.Upload(wKey, wCPU)
- if err != nil {
- return err
- }
- }
- // Run CUDA RMSNorm
- profile.Start("RMSNorm/kernel")
- err = cuda.RMSNorm(gpuX.Data().(unsafe.Pointer), gpuW, seqLen, dim, eps, gpu)
- profile.End("RMSNorm/kernel")
- if err != nil {
- return err
- }
- // Copy back
- if err := gpuX.CopyToHost(xCPU.DataFloat32()); err != nil {
- return err
- }
- return nil
- }
- func rmsNormCPU(x, w tensor.Tensor, eps float32) error {
- xCPU := x.(*cpu.Tensor)
- wCPU := w.(*cpu.Tensor)
- xData := xCPU.DataFloat32()
- wData := wCPU.DataFloat32()
- dim := wCPU.Shape().NumElements()
- numRows := xCPU.Shape().NumElements() / dim
- for i := 0; i < numRows; i++ {
- row := xData[i*dim : (i+1)*dim]
- ss := cpu.DotFloat32(row, row) / float32(dim)
- invRMS := 1.0 / float32(math.Sqrt(float64(ss+eps)))
- for j := 0; j < dim; j++ {
- row[j] = row[j] * invRMS * wData[j]
- }
- }
- return nil
- }
- // RoPE applies Rotary Positional Embeddings in-place.
- func RoPE(ctx *Context, x tensor.Tensor, positions []int, headDim int, theta float32) error {
- useGPU := ctx != nil && ctx.IsGPU() && device.CUDAAvailable()
- if !useGPU {
- profile.Start("RoPE/CPU")
- err := ropeCPU(x, positions, headDim, theta)
- profile.End("RoPE/CPU")
- return err
- }
- profile.Start("RoPE/GPU")
- defer profile.End("RoPE/GPU")
- xCPU := x.(*cpu.Tensor)
- gpu := ctx.Placement().GPU
- shape := x.Shape()
- seqLen := shape[0]
- totalDim := shape[1]
- numHeads := totalDim / headDim
- // Upload x to GPU
- gpuX, err := cuda.NewTensor(shape, tensor.Float32, gpu)
- if err != nil {
- return err
- }
- if err := gpuX.CopyFrom(xCPU.DataFloat32()); err != nil {
- return err
- }
- // Upload positions
- posData := make([]int32, len(positions))
- for i, p := range positions {
- posData[i] = int32(p)
- }
- gpuPos, err := cuda.NewTensor(tensor.Shape{len(positions)}, tensor.Float32, gpu) // Reuse for int alloc
- if err != nil {
- return err
- }
- // Manual copy as int32
- posPtr := unsafe.Pointer(&posData[0])
- if err := gpuPos.CopyFrom(unsafe.Slice((*float32)(posPtr), len(posData))); err != nil {
- return err
- }
- // Run CUDA RoPE
- profile.Start("RoPE/kernel")
- err = cuda.RoPE(gpuX.Data().(unsafe.Pointer), gpuPos.Data().(unsafe.Pointer), seqLen, numHeads, headDim, theta, gpu)
- profile.End("RoPE/kernel")
- if err != nil {
- return err
- }
- // Copy back
- if err := gpuX.CopyToHost(xCPU.DataFloat32()); err != nil {
- return err
- }
- return nil
- }
- func ropeCPU(x tensor.Tensor, positions []int, headDim int, theta float32) error {
- xCPU := x.(*cpu.Tensor)
- data := xCPU.DataFloat32()
- shape := x.Shape()
- seqLen := shape[0]
- totalDim := shape[1]
- halfDim := headDim / 2
- invFreqs := make([]float64, halfDim)
- for j := 0; j < halfDim; j++ {
- invFreqs[j] = 1.0 / math.Pow(float64(theta), float64(2*j)/float64(headDim))
- }
- for seq := 0; seq < seqLen; seq++ {
- pos := positions[seq]
- rowStart := seq * totalDim
- for headStart := 0; headStart < totalDim; headStart += headDim {
- for j := 0; j < halfDim; j++ {
- freq := float64(pos) * invFreqs[j]
- sin, cos := math.Sincos(freq)
- idx0 := rowStart + headStart + j
- idx1 := rowStart + headStart + j + halfDim
- v0 := data[idx0]
- v1 := data[idx1]
- data[idx0] = v0*float32(cos) - v1*float32(sin)
- data[idx1] = v1*float32(cos) + v0*float32(sin)
- }
- }
- }
- return nil
- }
- // SwiGLU computes SiLU(gate) * up and stores in out.
- func SwiGLU(ctx *Context, gate, up, out tensor.Tensor) error {
- useGPU := ctx != nil && ctx.IsGPU() && device.CUDAAvailable()
- if !useGPU {
- profile.Start("SwiGLU/CPU")
- err := swigluCPU(gate, up, out)
- profile.End("SwiGLU/CPU")
- return err
- }
- profile.Start("SwiGLU/GPU")
- defer profile.End("SwiGLU/GPU")
- gCPU := gate.(*cpu.Tensor)
- uCPU := up.(*cpu.Tensor)
- oCPU := out.(*cpu.Tensor)
- gpu := ctx.Placement().GPU
- n := gate.Shape().NumElements()
- // Upload gate and up to GPU
- gpuGate, err := cuda.NewTensor(gate.Shape(), tensor.Float32, gpu)
- if err != nil {
- return err
- }
- gpuUp, err := cuda.NewTensor(up.Shape(), tensor.Float32, gpu)
- if err != nil {
- return err
- }
- if err := gpuGate.CopyFrom(gCPU.DataFloat32()); err != nil {
- return err
- }
- if err := gpuUp.CopyFrom(uCPU.DataFloat32()); err != nil {
- return err
- }
- // SiLU(gate) in-place
- profile.Start("SwiGLU/silu_kernel")
- if err := cuda.SiLU(gpuGate.Data().(unsafe.Pointer), n, gpu); err != nil {
- profile.End("SwiGLU/silu_kernel")
- return err
- }
- profile.End("SwiGLU/silu_kernel")
- // gate = gate * up
- profile.Start("SwiGLU/mul_kernel")
- if err := cuda.MulInplace(gpuGate.Data().(unsafe.Pointer), gpuUp.Data().(unsafe.Pointer), n, gpu); err != nil {
- profile.End("SwiGLU/mul_kernel")
- return err
- }
- profile.End("SwiGLU/mul_kernel")
- // Copy to output
- if err := gpuGate.CopyToHost(oCPU.DataFloat32()); err != nil {
- return err
- }
- return nil
- }
- func swigluCPU(gate, up, out tensor.Tensor) error {
- gCPU := gate.(*cpu.Tensor)
- uCPU := up.(*cpu.Tensor)
- oCPU := out.(*cpu.Tensor)
- gData := gCPU.DataFloat32()
- uData := uCPU.DataFloat32()
- oData := oCPU.DataFloat32()
- for i := range gData {
- gv := gData[i]
- silu := gv / (1.0 + float32(math.Exp(float64(-gv))))
- oData[i] = silu * uData[i]
- }
- return nil
- }
- // Softmax applies softmax along the last dimension.
- func Softmax(ctx *Context, x tensor.Tensor) error {
- xCPU := x.(*cpu.Tensor)
- return softmaxCPU(xCPU)
- }
- func softmaxCPU(x *cpu.Tensor) error {
- data := x.DataFloat32()
- shape := x.Shape()
- if len(shape) != 2 {
- return fmt.Errorf("softmax: expected 2D tensor")
- }
- rows := shape[0]
- cols := shape[1]
- for i := 0; i < rows; i++ {
- row := data[i*cols : (i+1)*cols]
- maxVal := row[0]
- for _, v := range row[1:] {
- if v > maxVal {
- maxVal = v
- }
- }
- sum := float32(0)
- for j := range row {
- row[j] = float32(math.Exp(float64(row[j] - maxVal)))
- sum += row[j]
- }
- for j := range row {
- row[j] /= sum
- }
- }
- return nil
- }
- // Add performs element-wise addition: dst += src
- func Add(dst, src tensor.Tensor) error {
- dCPU := dst.(*cpu.Tensor)
- sCPU := src.(*cpu.Tensor)
- dData := dCPU.DataFloat32()
- sData := sCPU.DataFloat32()
- for i := range dData {
- dData[i] += sData[i]
- }
- return nil
- }
- // CopyData copies tensor data: dst = src
- func CopyData(dst, src tensor.Tensor) error {
- dCPU := dst.(*cpu.Tensor)
- sCPU := src.(*cpu.Tensor)
- copy(dCPU.DataFloat32(), sCPU.DataFloat32())
- return nil
- }
|