package matmul

import (
	"math/rand"
	"testing"
	"unsafe"

	"makarna/pkg/backend/cpu"
	"makarna/pkg/tensor"
)

// BenchmarkLinearF32Decode times Linear for a single-token decode step: a
// 1x512 float32 input against a 1024x512 float32 weight, written into a
// 1x1024 output tensor.
func BenchmarkLinearF32Decode(b *testing.B) {
	const (
		rows  = 1
		depth = 512
		cols  = 1024
	)

	// Random activations first, then random weights.
	activations := make([]float32, rows*depth)
	for idx := 0; idx < len(activations); idx++ {
		activations[idx] = rand.Float32()
	}
	weights := make([]float32, cols*depth)
	for idx := 0; idx < len(weights); idx++ {
		weights[idx] = rand.Float32()
	}
	result := make([]float32, rows*cols)

	src := cpu.NewTensor(tensor.Shape{rows, depth}, activations)
	weight := cpu.NewTensor(tensor.Shape{cols, depth}, weights)
	dst := cpu.NewTensor(tensor.Shape{rows, cols}, result)

	// Everything above is setup; keep it out of the measured region.
	b.ReportAllocs()
	b.ResetTimer()
	for iter := 0; iter < b.N; iter++ {
		Linear(src, weight, dst)
	}
}

// BenchmarkLinearF32Prefill times Linear for a prefill-style batch: a 32x512
// float32 input against a 1024x512 float32 weight, written into a 32x1024
// output tensor.
func BenchmarkLinearF32Prefill(b *testing.B) {
	const (
		rows  = 32
		depth = 512
		cols  = 1024
	)

	// Random activations first, then random weights.
	activations := make([]float32, rows*depth)
	for idx := 0; idx < len(activations); idx++ {
		activations[idx] = rand.Float32()
	}
	weights := make([]float32, cols*depth)
	for idx := 0; idx < len(weights); idx++ {
		weights[idx] = rand.Float32()
	}
	result := make([]float32, rows*cols)

	src := cpu.NewTensor(tensor.Shape{rows, depth}, activations)
	weight := cpu.NewTensor(tensor.Shape{cols, depth}, weights)
	dst := cpu.NewTensor(tensor.Shape{rows, cols}, result)

	// Everything above is setup; keep it out of the measured region.
	b.ReportAllocs()
	b.ResetTimer()
	for iter := 0; iter < b.N; iter++ {
		Linear(src, weight, dst)
	}
}

// BenchmarkLinearQ4KDecode times Linear for a single-row float32 input
// (1x512) against a 1024x512 Q4_K weight tensor built by makeQ4Blocks and
// makeQ4Tensor, written into a 1x1024 output tensor.
func BenchmarkLinearQ4KDecode(b *testing.B) {
	const (
		rows  = 1
		depth = 512 // K multiple of 256
		cols  = 1024
	)

	activations := make([]float32, rows*depth)
	for idx := 0; idx < len(activations); idx++ {
		activations[idx] = rand.Float32()
	}
	quantized := makeQ4Blocks(cols, depth)
	result := make([]float32, rows*cols)

	src := cpu.NewTensor(tensor.Shape{rows, depth}, activations)
	weight := makeQ4Tensor(b, tensor.Shape{cols, depth}, quantized)
	dst := cpu.NewTensor(tensor.Shape{rows, cols}, result)

	// Everything above is setup; keep it out of the measured region.
	b.ReportAllocs()
	b.ResetTimer()
	for iter := 0; iter < b.N; iter++ {
		Linear(src, weight, dst)
	}
}

// BenchmarkLinearQ4KPrefill times Linear for a multi-row float32 input
// (16x512) against a 512x512 Q4_K weight tensor built by makeQ4Blocks and
// makeQ4Tensor, written into a 16x512 output tensor.
func BenchmarkLinearQ4KPrefill(b *testing.B) {
	const (
		rows  = 16
		depth = 512
		cols  = 512
	)

	activations := make([]float32, rows*depth)
	for idx := 0; idx < len(activations); idx++ {
		activations[idx] = rand.Float32()
	}
	quantized := makeQ4Blocks(cols, depth)
	result := make([]float32, rows*cols)

	src := cpu.NewTensor(tensor.Shape{rows, depth}, activations)
	weight := makeQ4Tensor(b, tensor.Shape{cols, depth}, quantized)
	dst := cpu.NewTensor(tensor.Shape{rows, cols}, result)

	// Everything above is setup; keep it out of the measured region.
	b.ReportAllocs()
	b.ResetTimer()
	for iter := 0; iter < b.N; iter++ {
		Linear(src, weight, dst)
	}
}

// fillQ4Blocks overwrites every block in bs, in place, with the same fixed
// pattern: D = 0x3c00, DMin = 0, every Scales entry = 1 and every QS byte =
// 0x11.
func fillQ4Blocks(bs []tensor.BlockQ4_K) {
	// 0x3c00 is the IEEE 754 half-precision bit pattern for 1.0.
	const halfOne uint16 = 0x3c00

	for idx := 0; idx < len(bs); idx++ {
		// Take the address so the writes land in the slice element itself.
		blk := &bs[idx]
		blk.D, blk.DMin = halfOne, 0
		for s := range blk.Scales {
			blk.Scales[s] = 1
		}
		for q := range blk.QS {
			blk.QS[q] = 0x11 // both the high and low nibble hold 1
		}
	}
}

// makeQ4Blocks allocates N * (K / tensor.QK_K) Q4_K blocks, initializes them
// with fillQ4Blocks and returns the slice.
//
// The division is an integer division, so a K that is not a multiple of
// tensor.QK_K is rounded down to whole blocks per row.
func makeQ4Blocks(N, K int) []tensor.BlockQ4_K {
	blocks := make([]tensor.BlockQ4_K, N*(K/tensor.QK_K))
	fillQ4Blocks(blocks)
	return blocks
}

// makeQ4Tensor builds a Q4_K tensor of the given shape from blocks by handing
// their raw bytes to cpu.NewTensorFromBytes. Any error from that constructor
// fails the test or benchmark via tb.Fatalf.
//
// For a non-empty blocks slice the byte buffer is a view over the memory of
// blocks (no copy is made here), so the caller should not mutate blocks while
// the tensor is in use.
// NOTE(review): whether NewTensorFromBytes itself copies the buffer is not
// visible from this file — confirm before relying on aliasing either way.
func makeQ4Tensor(tb testing.TB, shape tensor.Shape, blocks []tensor.BlockQ4_K) *cpu.Tensor {
	// Mark as a helper so Fatalf reports the caller's line, not this one.
	tb.Helper()

	if len(blocks) == 0 {
		// &blocks[0] would panic on an empty slice, so pass an empty buffer.
		t, err := cpu.NewTensorFromBytes(shape, tensor.Q4_K, []byte{})
		if err != nil {
			tb.Fatalf("makeQ4Tensor empty: %v", err)
		}
		return t
	}

	// Reinterpret the contiguous block array as len(blocks)*blockSize bytes.
	blockSize := int(unsafe.Sizeof(tensor.BlockQ4_K{}))
	buf := unsafe.Slice((*byte)(unsafe.Pointer(&blocks[0])), len(blocks)*blockSize)
	t, err := cpu.NewTensorFromBytes(shape, tensor.Q4_K, buf)
	if err != nil {
		tb.Fatalf("makeQ4Tensor: %v", err)
	}
	return t
}