// Package compute provides device-agnostic computation with hybrid CPU/GPU support.
package compute

import (
	"fmt"

	"makarna/pkg/backend/cpu"
	"makarna/pkg/backend/cuda"
	"makarna/pkg/backend/device"
	"makarna/pkg/tensor"
)

// Activation wraps a tensor with device tracking.
// It enables efficient hybrid execution where transfers only happen
// when crossing device boundaries.
//
// The placement field is kept alongside the tensor so that device checks
// (see IsGPU and EnsureOn) are a plain value comparison rather than a type
// inspection of the tensor.
type Activation struct {
	// tensor is the wrapped tensor; in this file it is accessed as either
	// *cpu.Tensor or *cuda.Tensor.
	tensor tensor.Tensor
	// placement records the device the tensor is believed to live on.
	// The constructors store it in normalized form.
	placement tensor.DevicePlacement
}

// NewActivation creates an activation of the given shape on the requested
// device.
//
// When placement asks for CUDA and CUDA is available, a Float32 tensor is
// allocated on placement.GPU. If CUDA is not requested, not available, or the
// GPU allocation fails, the activation is created on the CPU instead. Callers
// that need to know where the activation ended up should inspect Placement().
//
// The fallback is deliberate best-effort: a GPU allocation failure is not
// reported as an error, because a usable CPU activation is returned in its
// place. The returned error is therefore always nil today; it is kept in the
// signature for compatibility with existing callers.
func NewActivation(shape tensor.Shape, placement tensor.DevicePlacement) (*Activation, error) {
	if placement.Type == tensor.CUDA && device.CUDAAvailable() {
		if gpuTensor, err := cuda.NewTensor(shape, tensor.Float32, placement.GPU); err == nil {
			return &Activation{tensor: gpuTensor, placement: placement.Normalize()}, nil
		}
		// GPU allocation failed: fall through to the CPU path below rather
		// than handing the caller both an activation and an error.
	}

	cpuPlacement := tensor.DevicePlacement{Type: tensor.CPU, GPU: -1}
	return &Activation{
		tensor:    cpu.NewTensor(shape, nil),
		placement: cpuPlacement.Normalize(),
	}, nil
}

// NewActivationFrom wraps an existing tensor in an Activation.
//
// The placement is derived from the tensor's dynamic type: a *cuda.Tensor is
// recorded as CUDA on the GPU it reports, anything else is recorded as CPU.
// The tensor itself is stored as-is, without copying.
func NewActivationFrom(t tensor.Tensor) *Activation {
	// Start from the CPU default and override only for CUDA tensors.
	where := tensor.DevicePlacement{Type: tensor.CPU, GPU: -1}
	if gpuTensor, isCUDA := t.(*cuda.Tensor); isCUDA {
		where = tensor.DevicePlacement{Type: tensor.CUDA, GPU: gpuTensor.GPU()}
	}

	return &Activation{tensor: t, placement: where.Normalize()}
}

// Tensor returns the underlying tensor.
//
// The value returned is the one the activation currently holds, not a copy.
// EnsureOn and ReplaceWith may free a previously held GPU tensor, so a value
// obtained here should not be retained across those calls.
func (a *Activation) Tensor() tensor.Tensor {
	return a.tensor
}

// Placement returns the current device placement.
//
// This is the placement recorded on the activation; it is updated by EnsureOn
// and ReplaceWith when the underlying tensor changes.
func (a *Activation) Placement() tensor.DevicePlacement {
	return a.placement
}

// IsGPU reports whether the activation is on GPU.
//
// The answer is taken from the recorded placement (Type == tensor.CUDA), not
// from the dynamic type of the wrapped tensor.
func (a *Activation) IsGPU() bool {
	return a.placement.Type == tensor.CUDA
}

// Shape returns the tensor shape.
//
// The call is forwarded directly to the wrapped tensor, so the activation
// must hold a non-nil tensor.
func (a *Activation) Shape() tensor.Shape {
	return a.tensor.Shape()
}

// EnsureOn moves the activation to the target device if needed.
//
// The target is normalized before comparison. If the activation's recorded
// placement already equals it, nothing happens and transferred is false.
// Otherwise the tensor is moved via device.EnsureOn, the activation is updated
// to hold the result, and transferred is true.
//
// After a successful transfer the previously held GPU tensor (if any) is
// freed, so references obtained earlier through Tensor() must not be used
// afterwards. On error the activation is left unchanged.
func (a *Activation) EnsureOn(target tensor.DevicePlacement) (transferred bool, err error) {
	target = target.Normalize()

	// Already on target device.
	if a.placement == target {
		return false, nil
	}

	// Transfer needed.
	newTensor, err := device.EnsureOn(a.tensor, target)
	if err != nil {
		return false, fmt.Errorf("activation transfer %v -> %v: %w", a.placement, target, err)
	}

	// Free the old GPU tensor to prevent a memory leak, but never when the
	// transfer handed back that very tensor: freeing it would invalidate the
	// tensor we are about to store.
	if oldCT, ok := a.tensor.(*cuda.Tensor); ok && oldCT != nil {
		if newCT, isCUDA := newTensor.(*cuda.Tensor); !isCUDA || newCT != oldCT {
			oldCT.Free()
		}
	}

	a.tensor = newTensor
	a.placement = target
	return true, nil
}

// AsCPU returns the tensor as *cpu.Tensor, transferring if needed.
//
// The activation is first moved to the CPU with EnsureOn; any transfer error
// is returned unchanged. An error is also returned, instead of panicking, if
// the activation does not hold a *cpu.Tensor afterwards (for example when it
// wraps nil or a tensor type that is neither CPU nor CUDA).
func (a *Activation) AsCPU() (*cpu.Tensor, error) {
	if _, err := a.EnsureOn(tensor.DevicePlacement{Type: tensor.CPU, GPU: -1}); err != nil {
		return nil, err
	}

	cpuTensor, ok := a.tensor.(*cpu.Tensor)
	if !ok {
		return nil, fmt.Errorf("activation on %v holds %T, not *cpu.Tensor", a.placement, a.tensor)
	}
	return cpuTensor, nil
}

// AsCUDA returns the tensor as *cuda.Tensor, transferring if needed.
//
// The activation is first moved to the given GPU with EnsureOn; any transfer
// error is returned unchanged. An error is also returned, instead of
// panicking, if the activation does not hold a *cuda.Tensor afterwards.
func (a *Activation) AsCUDA(gpu int) (*cuda.Tensor, error) {
	if _, err := a.EnsureOn(tensor.DevicePlacement{Type: tensor.CUDA, GPU: gpu}); err != nil {
		return nil, err
	}

	gpuTensor, ok := a.tensor.(*cuda.Tensor)
	if !ok {
		return nil, fmt.Errorf("activation on %v holds %T, not *cuda.Tensor", a.placement, a.tensor)
	}
	return gpuTensor, nil
}

// ReplaceWith replaces the underlying tensor and updates placement.
//
// If the activation currently holds a GPU tensor and t is a different tensor
// (or not a GPU tensor at all), the old GPU tensor is freed. Passing the
// tensor the activation already holds does not free it.
//
// The new placement is derived from t's dynamic type, as in NewActivationFrom:
// a *cuda.Tensor is recorded as CUDA on the GPU it reports, anything else as
// CPU. The placement is stored normalized so that it compares equal to the
// normalized targets used by EnsureOn.
func (a *Activation) ReplaceWith(t tensor.Tensor) {
	// Release the old GPU tensor unless it is the one being installed.
	if oldCT, ok := a.tensor.(*cuda.Tensor); ok && oldCT != nil {
		if newCT, isCUDA := t.(*cuda.Tensor); !isCUDA || newCT != oldCT {
			oldCT.Free()
		}
	}

	placement := tensor.DevicePlacement{Type: tensor.CPU, GPU: -1}
	if ct, ok := t.(*cuda.Tensor); ok {
		placement = tensor.DevicePlacement{Type: tensor.CUDA, GPU: ct.GPU()}
	}

	a.tensor = t
	a.placement = placement.Normalize()
}

// Clone creates a deep copy of the activation on the same device.
//
// For a GPU activation a new tensor with the same shape, dtype and GPU index
// is allocated and the data is copied through a temporary host buffer. If
// either copy step fails, the newly allocated GPU tensor is freed before the
// error is returned. For a CPU activation a new CPU tensor is allocated and
// the float32 data is copied into it.
//
// An error is returned, instead of panicking, if the wrapped tensor does not
// have the concrete type implied by the recorded placement.
//
// NOTE(review): the staging buffer is []float32 while the new GPU tensor uses
// the source's DType — this presumably assumes Float32 activations; confirm.
func (a *Activation) Clone() (*Activation, error) {
	if a.IsGPU() {
		src, ok := a.tensor.(*cuda.Tensor)
		if !ok || src == nil {
			return nil, fmt.Errorf("clone activation: placement %v but tensor is %T", a.placement, a.tensor)
		}

		dst, err := cuda.NewTensor(src.Shape(), src.DType(), src.GPU())
		if err != nil {
			return nil, fmt.Errorf("clone activation: allocate gpu tensor: %w", err)
		}

		// Copy GPU to GPU via the host (simple path).
		staging := make([]float32, src.Shape().NumElements())
		if err := src.CopyToHost(staging); err != nil {
			dst.Free() // do not leak the fresh allocation on failure
			return nil, fmt.Errorf("clone activation: copy to host: %w", err)
		}
		if err := dst.CopyFrom(staging); err != nil {
			dst.Free() // do not leak the fresh allocation on failure
			return nil, fmt.Errorf("clone activation: copy to gpu: %w", err)
		}
		return &Activation{tensor: dst, placement: a.placement}, nil
	}

	// CPU clone.
	src, ok := a.tensor.(*cpu.Tensor)
	if !ok || src == nil {
		return nil, fmt.Errorf("clone activation: placement %v but tensor is %T", a.placement, a.tensor)
	}
	dst := cpu.NewTensor(src.Shape(), nil)
	copy(dst.DataFloat32(), src.DataFloat32())
	return &Activation{tensor: dst, placement: a.placement}, nil
}

// CopyFrom copies data from a CPU tensor to this activation.
//
// The source's float32 data is uploaded with the GPU tensor's CopyFrom when
// the activation is on GPU, or copied element-wise into the CPU tensor's
// float32 data otherwise. On the CPU path the built-in copy is used, so if
// the two buffers differ in length only the shorter length is copied.
//
// An error is returned, instead of panicking, if t is nil or if the wrapped
// tensor does not have the concrete type implied by the recorded placement.
func (a *Activation) CopyFrom(t *cpu.Tensor) error {
	if t == nil {
		return fmt.Errorf("activation copy: source tensor is nil")
	}
	src := t.DataFloat32()

	if a.IsGPU() {
		dst, ok := a.tensor.(*cuda.Tensor)
		if !ok || dst == nil {
			return fmt.Errorf("activation copy: placement %v but tensor is %T", a.placement, a.tensor)
		}
		return dst.CopyFrom(src)
	}

	dst, ok := a.tensor.(*cpu.Tensor)
	if !ok || dst == nil {
		return fmt.Errorf("activation copy: placement %v but tensor is %T", a.placement, a.tensor)
	}
	copy(dst.DataFloat32(), src)
	return nil
}

// FreeActivation releases the GPU tensor held by an activation, if any.
// It is a no-op for a nil activation and for activations whose tensor is not
// a (non-nil) *cuda.Tensor, so it is safe to call on nil or CPU activations.
func FreeActivation(a *Activation) {
	if a == nil {
		return
	}

	gpuTensor, isCUDA := a.tensor.(*cuda.Tensor)
	if !isCUDA || gpuTensor == nil {
		// Nothing GPU-backed to release.
		return
	}
	gpuTensor.Free()
}