// Package compute provides device-agnostic computation dispatching.
// Operations automatically route to the appropriate backend (CPU/CUDA)
// based on tensor placement, eliminating manual device management in model code.
package compute

import (
	"fmt"

	"makarna/pkg/backend/cpu"
	"makarna/pkg/backend/cuda"
	"makarna/pkg/backend/device"
	"makarna/pkg/tensor"
)

// Context holds computation state for a forward pass.
type Context struct {
	Dispatcher *device.DeviceDispatcher
	LayerIdx   int
	Scratch    *ScratchSpace
	CPUMoE     bool // Keep MoE expert weights on CPU
}

// NewContext creates a computation context.
func NewContext(dispatcher *device.DeviceDispatcher, layerIdx int) *Context {
	return &Context{
		Dispatcher: dispatcher,
		LayerIdx:   layerIdx,
	}
}

// Placement returns the current layer's device placement.
func (c *Context) Placement() tensor.DevicePlacement {
	if c.Dispatcher == nil {
		return tensor.DevicePlacement{Type: tensor.CPU, GPU: -1}
	}
	return c.Dispatcher.LayerPlacement(c.LayerIdx)
}

// IsGPU returns true if the current layer is placed on a GPU.
func (c *Context) IsGPU() bool {
	return c.Placement().Type == tensor.CUDA
}

// EnsureWeight ensures a weight tensor is on the correct device, caching the
// device copy so repeated forward passes do not re-upload it.
func (c *Context) EnsureWeight(t tensor.Tensor, name string) (tensor.Tensor, error) {
	if c.Dispatcher == nil {
		return t, nil
	}
	placement := c.Placement()
	if placement.Type == tensor.CPU {
		return t, nil
	}
	cache := c.Dispatcher.GetWeightCache(placement.GPU)
	key := fmt.Sprintf("%d:%s", c.LayerIdx, name)
	return device.EnsureOnCached(t, placement, cache, key)
}

// EnsureActivation ensures an activation tensor is on the correct device.
// Unlike weights, activations are not cached between forward passes.
func (c *Context) EnsureActivation(t tensor.Tensor) (tensor.Tensor, error) {
	if c.Dispatcher == nil {
		return t, nil
	}
	return device.EnsureOn(t, c.Placement())
}

// Zeros creates a zero tensor on the appropriate device.
func Zeros(ctx *Context, shape tensor.Shape) tensor.Tensor {
	if ctx == nil || !ctx.IsGPU() || !device.CUDAAvailable() {
		return cpu.NewTensor(shape, nil)
	}
	t, err := cuda.NewTensor(shape, tensor.Float32, ctx.Placement().GPU)
	if err != nil {
		// Fall back to CPU if GPU allocation fails.
		return cpu.NewTensor(shape, nil)
	}
	return t
}

// ZerosCPU always creates a CPU tensor (for inputs/outputs).
func ZerosCPU(shape tensor.Shape) *cpu.Tensor {
	return cpu.NewTensor(shape, nil)
}

// ToCPU copies a tensor to CPU if needed.
func ToCPU(t tensor.Tensor) (*cpu.Tensor, error) {
	if cpuT, ok := t.(*cpu.Tensor); ok {
		return cpuT, nil
	}
	result, err := device.EnsureOn(t, tensor.DevicePlacement{Type: tensor.CPU, GPU: -1})
	if err != nil {
		return nil, err
	}
	return result.(*cpu.Tensor), nil
}

// Copy copies data between tensors, handling cross-device copies.
func Copy(dst, src tensor.Tensor) error {
	// Same device, same type: CPU-to-CPU is a plain element copy.
	if dstCPU, ok := dst.(*cpu.Tensor); ok {
		if srcCPU, ok := src.(*cpu.Tensor); ok {
			copy(dstCPU.DataFloat32(), srcCPU.DataFloat32())
			return nil
		}
	}
	if dstCUDA, ok := dst.(*cuda.Tensor); ok {
		if srcCUDA, ok := src.(*cuda.Tensor); ok {
			// TODO: CUDA-to-CUDA copy kernel
			_ = dstCUDA
			_ = srcCUDA
			return fmt.Errorf("CUDA-to-CUDA copy not implemented")
		}
	}
	// Cross-device copies require an explicit staging step (e.g. ToCPU first).
	return fmt.Errorf("cross-device copy requires explicit conversion")
}
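
// Usage sketch (illustrative only, not part of the package API): a hypothetical
// layer forward pass showing how Context routes tensors. The dispatcher, the
// weight tensor qProjWeight, the activation hidden, and the shape dimensions are
// placeholders assumed to come from the caller's model code.
//
//	ctx := compute.NewContext(dispatcher, layerIdx)
//	wq, err := ctx.EnsureWeight(qProjWeight, "attn.q_proj") // cached per layer+name when on GPU
//	if err != nil {
//		return err
//	}
//	x, err := ctx.EnsureActivation(hidden) // moved each pass, never cached
//	if err != nil {
//		return err
//	}
//	out := compute.Zeros(ctx, tensor.Shape{batch, hiddenDim}) // allocated on the layer's device
//	// ... run the backend matmul of wq and x into out ...
//	logits, err := compute.ToCPU(out) // bring the result back for sampling/output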