// Package compute provides device-agnostic computation dispatching.
// Operations automatically route to the appropriate backend (CPU/CUDA)
// based on tensor placement, eliminating manual device management in model code.
package compute

import (
	"fmt"

	"makarna/pkg/backend/cpu"
	"makarna/pkg/backend/cuda"
	"makarna/pkg/backend/device"
	"makarna/pkg/tensor"
)

// Context holds computation state for a forward pass.
type Context struct {
	Dispatcher *device.DeviceDispatcher
	LayerIdx   int
	Scratch    *ScratchSpace
	CPUMoE     bool // Keep MoE expert weights on CPU
}

// NewContext creates a computation context.
func NewContext(dispatcher *device.DeviceDispatcher, layerIdx int) *Context {
	return &Context{
		Dispatcher: dispatcher,
		LayerIdx:   layerIdx,
	}
}

// Placement returns the current layer's device placement.
func (c *Context) Placement() tensor.DevicePlacement {
	if c.Dispatcher == nil {
		return tensor.DevicePlacement{Type: tensor.CPU, GPU: -1}
	}
	return c.Dispatcher.LayerPlacement(c.LayerIdx)
}

// IsGPU returns true if the current layer is placed on a GPU.
func (c *Context) IsGPU() bool {
	return c.Placement().Type == tensor.CUDA
}

// EnsureWeight ensures a weight tensor is on the correct device, caching the
// device copy so repeated forward passes do not re-upload it.
func (c *Context) EnsureWeight(t tensor.Tensor, name string) (tensor.Tensor, error) {
	if c.Dispatcher == nil {
		return t, nil
	}
	placement := c.Placement()
	if placement.Type == tensor.CPU {
		return t, nil
	}
	cache := c.Dispatcher.GetWeightCache(placement.GPU)
	key := fmt.Sprintf("%d:%s", c.LayerIdx, name)
	return device.EnsureOnCached(t, placement, cache, key)
}

// EnsureActivation ensures an activation tensor is on the correct device.
// Unlike weights, activations are not cached between forward passes.
func (c *Context) EnsureActivation(t tensor.Tensor) (tensor.Tensor, error) {
	if c.Dispatcher == nil {
		return t, nil
	}
	return device.EnsureOn(t, c.Placement())
}

// Zeros creates a zero tensor on the appropriate device.
func Zeros(ctx *Context, shape tensor.Shape) tensor.Tensor {
	if ctx == nil || !ctx.IsGPU() || !device.CUDAAvailable() {
		return cpu.NewTensor(shape, nil)
	}
	t, err := cuda.NewTensor(shape, tensor.Float32, ctx.Placement().GPU)
	if err != nil {
		// Fall back to CPU if GPU allocation fails.
		return cpu.NewTensor(shape, nil)
	}
	return t
}

// ZerosCPU always creates a CPU tensor (for inputs/outputs).
func ZerosCPU(shape tensor.Shape) *cpu.Tensor {
	return cpu.NewTensor(shape, nil)
}

// ToCPU copies a tensor to CPU if needed.
func ToCPU(t tensor.Tensor) (*cpu.Tensor, error) {
	if cpuT, ok := t.(*cpu.Tensor); ok {
		return cpuT, nil
	}
	result, err := device.EnsureOn(t, tensor.DevicePlacement{Type: tensor.CPU, GPU: -1})
	if err != nil {
		return nil, err
	}
	return result.(*cpu.Tensor), nil
}

// Copy copies data between tensors, handling cross-device copies.
func Copy(dst, src tensor.Tensor) error {
	// Same device, same type: CPU-to-CPU is a plain element copy.
	if dstCPU, ok := dst.(*cpu.Tensor); ok {
		if srcCPU, ok := src.(*cpu.Tensor); ok {
			copy(dstCPU.DataFloat32(), srcCPU.DataFloat32())
			return nil
		}
	}
	if dstCUDA, ok := dst.(*cuda.Tensor); ok {
		if srcCUDA, ok := src.(*cuda.Tensor); ok {
			// TODO: CUDA-to-CUDA copy kernel
			_ = dstCUDA
			_ = srcCUDA
			return fmt.Errorf("CUDA-to-CUDA copy not implemented")
		}
	}
	// Cross-device copies require an explicit staging step (e.g. ToCPU first).
	return fmt.Errorf("cross-device copy requires explicit conversion")
}
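
// Usage sketch (illustrative only, not part of the package API): a hypothetical
// layer forward pass showing how Context routes tensors. The dispatcher, the
// weight tensor qProjWeight, the activation hidden, and the shape dimensions are
// placeholders assumed to come from the caller's model code.
//
//	ctx := compute.NewContext(dispatcher, layerIdx)
//	wq, err := ctx.EnsureWeight(qProjWeight, "attn.q_proj") // cached per layer+name when on GPU
//	if err != nil {
//		return err
//	}
//	x, err := ctx.EnsureActivation(hidden) // moved each pass, never cached
//	if err != nil {
//		return err
//	}
//	out := compute.Zeros(ctx, tensor.Shape{batch, hiddenDim}) // allocated on the layer's device
//	// ... run the backend matmul of wq and x into out ...
//	logits, err := compute.ToCPU(out) // bring the result back for sampling/output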