package matmul import ( "math/rand" "testing" "unsafe" "makarna/pkg/backend/cpu" "makarna/pkg/tensor" ) func BenchmarkLinearF32Decode(b *testing.B) { // Simulate single-token decode: M=1, moderate N M, K, N := 1, 512, 1024 in := make([]float32, M*K) for i := range in { in[i] = rand.Float32() } w := make([]float32, N*K) for i := range w { w[i] = rand.Float32() } out := make([]float32, M*N) inT := cpu.NewTensor(tensor.Shape{M, K}, in) wT := cpu.NewTensor(tensor.Shape{N, K}, w) outT := cpu.NewTensor(tensor.Shape{M, N}, out) b.ReportAllocs() b.ResetTimer() for i := 0; i < b.N; i++ { Linear(inT, wT, outT) } } func BenchmarkLinearF32Prefill(b *testing.B) { // Prefill-style: larger M M, K, N := 32, 512, 1024 in := make([]float32, M*K) for i := range in { in[i] = rand.Float32() } w := make([]float32, N*K) for i := range w { w[i] = rand.Float32() } out := make([]float32, M*N) inT := cpu.NewTensor(tensor.Shape{M, K}, in) wT := cpu.NewTensor(tensor.Shape{N, K}, w) outT := cpu.NewTensor(tensor.Shape{M, N}, out) b.ReportAllocs() b.ResetTimer() for i := 0; i < b.N; i++ { Linear(inT, wT, outT) } } func BenchmarkLinearQ4KDecode(b *testing.B) { M, K, N := 1, 512, 1024 // K multiple of 256 in := make([]float32, M*K) for i := range in { in[i] = rand.Float32() } weightBlocks := makeQ4Blocks(N, K) out := make([]float32, M*N) inT := cpu.NewTensor(tensor.Shape{M, K}, in) wT := makeQ4Tensor(b, tensor.Shape{N, K}, weightBlocks) outT := cpu.NewTensor(tensor.Shape{M, N}, out) b.ReportAllocs() b.ResetTimer() for i := 0; i < b.N; i++ { Linear(inT, wT, outT) } } func BenchmarkLinearQ4KPrefill(b *testing.B) { M, K, N := 16, 512, 512 in := make([]float32, M*K) for i := range in { in[i] = rand.Float32() } weightBlocks := makeQ4Blocks(N, K) out := make([]float32, M*N) inT := cpu.NewTensor(tensor.Shape{M, K}, in) wT := makeQ4Tensor(b, tensor.Shape{N, K}, weightBlocks) outT := cpu.NewTensor(tensor.Shape{M, N}, out) b.ReportAllocs() b.ResetTimer() for i := 0; i < b.N; i++ { Linear(inT, wT, outT) } } func fillQ4Blocks(bs []tensor.BlockQ4_K) { const fp16One = uint16(0x3c00) for i := range bs { b := &bs[i] b.D = fp16One b.DMin = 0 for j := range b.Scales { b.Scales[j] = 1 } for j := range b.QS { b.QS[j] = 0x11 // two nibbles set to 1 } } } func makeQ4Blocks(N, K int) []tensor.BlockQ4_K { blocksPerRow := K / tensor.QK_K weightBlocks := make([]tensor.BlockQ4_K, N*blocksPerRow) fillQ4Blocks(weightBlocks) return weightBlocks } func makeQ4Tensor(tb testing.TB, shape tensor.Shape, blocks []tensor.BlockQ4_K) *cpu.Tensor { if len(blocks) == 0 { t, err := cpu.NewTensorFromBytes(shape, tensor.Q4_K, []byte{}) if err != nil { tb.Fatalf("makeQ4Tensor empty: %v", err) } return t } blockSize := int(unsafe.Sizeof(tensor.BlockQ4_K{})) buf := unsafe.Slice((*byte)(unsafe.Pointer(&blocks[0])), len(blocks)*blockSize) t, err := cpu.NewTensorFromBytes(shape, tensor.Q4_K, buf) if err != nil { tb.Fatalf("makeQ4Tensor: %v", err) } return t }