package compute

import (
	"testing"

	"makarna/pkg/backend/cpu"
	"makarna/pkg/backend/device"
	"makarna/pkg/tensor"
)

// TestLinearCPU runs Linear on the CPU path (nil dispatcher) with a [2, 3]
// input and a [4, 3] weight, then compares the [2, 4] output against
// hand-computed values within a small absolute tolerance.
func TestLinearCPU(t *testing.T) {
	// Input: [2, 3] @ Weight: [4, 3] = Output: [2, 4]
	input := cpu.NewTensor(tensor.Shape{2, 3}, []float32{
		1, 2, 3,
		4, 5, 6,
	})
	// The first three weight rows each select one input column; the last
	// row sums all three columns.
	weight := cpu.NewTensor(tensor.Shape{4, 3}, []float32{
		1, 0, 0,
		0, 1, 0,
		0, 0, 1,
		1, 1, 1,
	})
	output := cpu.NewTensor(tensor.Shape{2, 4}, nil)

	ctx := NewContext(nil, 0) // nil dispatcher = CPU

	if err := Linear(ctx, input, weight, output); err != nil {
		t.Fatalf("Linear failed: %v", err)
	}

	expected := []float32{
		1, 2, 3, 6, // row 0: [1,2,3] dot each weight row
		4, 5, 6, 15, // row 1
	}

	outData := output.DataFloat32()
	// Fail cleanly instead of panicking on an out-of-range index if the
	// output buffer is shorter than the values we are about to compare.
	if len(outData) < len(expected) {
		t.Fatalf("output has %d elements, need at least %d", len(outData), len(expected))
	}

	const tol = 0.001
	for i, exp := range expected {
		// Written as a negated "within tolerance" test so that a NaN output
		// (for which every ordered comparison is false) is reported as a
		// failure rather than slipping through.
		if diff := outData[i] - exp; !(diff >= -tol && diff <= tol) {
			t.Errorf("output[%d] = %f, expected %f", i, outData[i], exp)
		}
	}
}

// TestRMSNorm applies RMSNorm with unit weights to a single row and checks
// that the mean of the squared values read back from x is close to 1. The
// test reads the result from x itself, so it expects RMSNorm to write its
// output in place.
func TestRMSNorm(t *testing.T) {
	x := cpu.NewTensor(tensor.Shape{1, 4}, []float32{1, 2, 3, 4})
	w := cpu.NewTensor(tensor.Shape{4}, []float32{1, 1, 1, 1})

	ctx := NewContext(nil, 0)
	if err := RMSNorm(ctx, x, w, 1e-6); err != nil {
		t.Fatalf("RMSNorm failed: %v", err)
	}

	// Check output is normalized
	data := x.DataFloat32()
	if len(data) == 0 {
		// Avoid a 0/0 division below, which would yield NaN.
		t.Fatal("RMSNorm left no data to inspect")
	}

	var sumSq float32
	for _, v := range data {
		sumSq += v * v
	}
	// Mean of squares; its square root is the RMS, and both are ~1 exactly
	// when the row is normalized, so no sqrt is needed for this check.
	meanSq := sumSq / float32(len(data))

	// Negated in-range test so that NaN (all ordered comparisons false) is
	// reported as a failure instead of passing silently.
	if !(meanSq >= 0.9 && meanSq <= 1.1) {
		t.Errorf("mean square after norm = %f, expected ~1.0", meanSq)
	}
}

// TestDeviceDispatcher builds a dispatcher over four layer placements (two
// CUDA, two CPU) and checks the GPU layer count, per-layer GPU membership,
// per-layer placement lookup, and the placement reported for an index past
// the end of the list.
func TestDeviceDispatcher(t *testing.T) {
	onCUDA := tensor.DevicePlacement{Type: tensor.CUDA, GPU: 0}
	onCPU := tensor.DevicePlacement{Type: tensor.CPU, GPU: -1}

	dispatcher := device.NewDeviceDispatcher([]tensor.DevicePlacement{
		onCUDA,
		onCUDA,
		onCPU,
		onCPU,
	})

	if got := dispatcher.NumGPULayers(); got != 2 {
		t.Errorf("NumGPULayers = %d, expected 2", got)
	}

	if first := dispatcher.IsLayerOnGPU(0); !first {
		t.Error("Layer 0 should be on GPU")
	}

	if third := dispatcher.IsLayerOnGPU(2); third {
		t.Error("Layer 2 should be on CPU")
	}

	if kind := dispatcher.LayerPlacement(1).Type; kind != tensor.CUDA {
		t.Errorf("Layer 1 placement = %v, expected CUDA", kind)
	}

	// An index past the configured layers is expected to fall back to CPU.
	if kind := dispatcher.LayerPlacement(100).Type; kind != tensor.CPU {
		t.Errorf("Out of bounds placement = %v, expected CPU", kind)
	}
}

func TestContextPlacement(t *testing.T) {
	placements := []tensor.DevicePlacement{
		{Type: tensor.CUDA, GPU: 0},
		{Type: tensor.CPU, GPU: -1},
	}

	dd := device.NewDeviceDispatcher(placements)

	ctx0 := NewContext(dd, 0)
	if !ctx0.IsGPU() {
		t.Error("Context 0 should be GPU")
	}

	ctx1 := NewContext(dd, 1)
	if ctx1.IsGPU() {
		t.Error("Context 1 should be CPU")
	}

	// Nil dispatcher
	ctxNil := NewContext(nil, 0)
	if ctxNil.IsGPU() {
		t.Error("Nil dispatcher should default to CPU")
	}
}

// TestSwiGLU runs SwiGLU on the CPU path with a two-element gate and up
// tensor and checks both outputs against the values implied by
// out[i] = SiLU(gate[i]) * up[i].
func TestSwiGLU(t *testing.T) {
	gate := cpu.NewTensor(tensor.Shape{2}, []float32{0, 1})
	up := cpu.NewTensor(tensor.Shape{2}, []float32{2, 3})
	out := cpu.NewTensor(tensor.Shape{2}, nil)

	ctx := NewContext(nil, 0)
	if err := SwiGLU(ctx, gate, up, out); err != nil {
		t.Fatalf("SwiGLU failed: %v", err)
	}

	// SiLU(0) = 0, so out[0] = 0 * 2 = 0
	// SiLU(1) ≈ 0.731, so out[1] ≈ 0.731 * 3 ≈ 2.19
	data := out.DataFloat32()
	// Fail cleanly instead of panicking if the output buffer is too short
	// for the two elements inspected below.
	if len(data) < 2 {
		t.Fatalf("output has %d elements, need at least 2", len(data))
	}
	if data[0] != 0 {
		t.Errorf("out[0] = %f, expected 0", data[0])
	}
	// Negated in-range test so that NaN (all ordered comparisons false) is
	// reported as a failure instead of passing silently.
	if !(data[1] >= 2.0 && data[1] <= 2.5) {
		t.Errorf("out[1] = %f, expected ~2.2", data[1])
	}
}