| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137 |
- package matmul
- import (
- "math/rand"
- "testing"
- "unsafe"
- "makarna/pkg/backend/cpu"
- "makarna/pkg/tensor"
- )
- func BenchmarkLinearF32Decode(b *testing.B) {
- // Simulate single-token decode: M=1, moderate N
- M, K, N := 1, 512, 1024
- in := make([]float32, M*K)
- for i := range in {
- in[i] = rand.Float32()
- }
- w := make([]float32, N*K)
- for i := range w {
- w[i] = rand.Float32()
- }
- out := make([]float32, M*N)
- inT := cpu.NewTensor(tensor.Shape{M, K}, in)
- wT := cpu.NewTensor(tensor.Shape{N, K}, w)
- outT := cpu.NewTensor(tensor.Shape{M, N}, out)
- b.ReportAllocs()
- b.ResetTimer()
- for i := 0; i < b.N; i++ {
- Linear(inT, wT, outT)
- }
- }
- func BenchmarkLinearF32Prefill(b *testing.B) {
- // Prefill-style: larger M
- M, K, N := 32, 512, 1024
- in := make([]float32, M*K)
- for i := range in {
- in[i] = rand.Float32()
- }
- w := make([]float32, N*K)
- for i := range w {
- w[i] = rand.Float32()
- }
- out := make([]float32, M*N)
- inT := cpu.NewTensor(tensor.Shape{M, K}, in)
- wT := cpu.NewTensor(tensor.Shape{N, K}, w)
- outT := cpu.NewTensor(tensor.Shape{M, N}, out)
- b.ReportAllocs()
- b.ResetTimer()
- for i := 0; i < b.N; i++ {
- Linear(inT, wT, outT)
- }
- }
- func BenchmarkLinearQ4KDecode(b *testing.B) {
- M, K, N := 1, 512, 1024 // K multiple of 256
- in := make([]float32, M*K)
- for i := range in {
- in[i] = rand.Float32()
- }
- weightBlocks := makeQ4Blocks(N, K)
- out := make([]float32, M*N)
- inT := cpu.NewTensor(tensor.Shape{M, K}, in)
- wT := makeQ4Tensor(b, tensor.Shape{N, K}, weightBlocks)
- outT := cpu.NewTensor(tensor.Shape{M, N}, out)
- b.ReportAllocs()
- b.ResetTimer()
- for i := 0; i < b.N; i++ {
- Linear(inT, wT, outT)
- }
- }
- func BenchmarkLinearQ4KPrefill(b *testing.B) {
- M, K, N := 16, 512, 512
- in := make([]float32, M*K)
- for i := range in {
- in[i] = rand.Float32()
- }
- weightBlocks := makeQ4Blocks(N, K)
- out := make([]float32, M*N)
- inT := cpu.NewTensor(tensor.Shape{M, K}, in)
- wT := makeQ4Tensor(b, tensor.Shape{N, K}, weightBlocks)
- outT := cpu.NewTensor(tensor.Shape{M, N}, out)
- b.ReportAllocs()
- b.ResetTimer()
- for i := 0; i < b.N; i++ {
- Linear(inT, wT, outT)
- }
- }
- func fillQ4Blocks(bs []tensor.BlockQ4_K) {
- const fp16One = uint16(0x3c00)
- for i := range bs {
- b := &bs[i]
- b.D = fp16One
- b.DMin = 0
- for j := range b.Scales {
- b.Scales[j] = 1
- }
- for j := range b.QS {
- b.QS[j] = 0x11 // two nibbles set to 1
- }
- }
- }
- func makeQ4Blocks(N, K int) []tensor.BlockQ4_K {
- blocksPerRow := K / tensor.QK_K
- weightBlocks := make([]tensor.BlockQ4_K, N*blocksPerRow)
- fillQ4Blocks(weightBlocks)
- return weightBlocks
- }
- func makeQ4Tensor(tb testing.TB, shape tensor.Shape, blocks []tensor.BlockQ4_K) *cpu.Tensor {
- if len(blocks) == 0 {
- t, err := cpu.NewTensorFromBytes(shape, tensor.Q4_K, []byte{})
- if err != nil {
- tb.Fatalf("makeQ4Tensor empty: %v", err)
- }
- return t
- }
- blockSize := int(unsafe.Sizeof(tensor.BlockQ4_K{}))
- buf := unsafe.Slice((*byte)(unsafe.Pointer(&blocks[0])), len(blocks)*blockSize)
- t, err := cpu.NewTensorFromBytes(shape, tensor.Q4_K, buf)
- if err != nil {
- tb.Fatalf("makeQ4Tensor: %v", err)
- }
- return t
- }
|