linear_bench_test.go 3.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137
  1. package matmul
  2. import (
  3. "math/rand"
  4. "testing"
  5. "unsafe"
  6. "makarna/pkg/backend/cpu"
  7. "makarna/pkg/tensor"
  8. )
  9. func BenchmarkLinearF32Decode(b *testing.B) {
  10. // Simulate single-token decode: M=1, moderate N
  11. M, K, N := 1, 512, 1024
  12. in := make([]float32, M*K)
  13. for i := range in {
  14. in[i] = rand.Float32()
  15. }
  16. w := make([]float32, N*K)
  17. for i := range w {
  18. w[i] = rand.Float32()
  19. }
  20. out := make([]float32, M*N)
  21. inT := cpu.NewTensor(tensor.Shape{M, K}, in)
  22. wT := cpu.NewTensor(tensor.Shape{N, K}, w)
  23. outT := cpu.NewTensor(tensor.Shape{M, N}, out)
  24. b.ReportAllocs()
  25. b.ResetTimer()
  26. for i := 0; i < b.N; i++ {
  27. Linear(inT, wT, outT)
  28. }
  29. }
  30. func BenchmarkLinearF32Prefill(b *testing.B) {
  31. // Prefill-style: larger M
  32. M, K, N := 32, 512, 1024
  33. in := make([]float32, M*K)
  34. for i := range in {
  35. in[i] = rand.Float32()
  36. }
  37. w := make([]float32, N*K)
  38. for i := range w {
  39. w[i] = rand.Float32()
  40. }
  41. out := make([]float32, M*N)
  42. inT := cpu.NewTensor(tensor.Shape{M, K}, in)
  43. wT := cpu.NewTensor(tensor.Shape{N, K}, w)
  44. outT := cpu.NewTensor(tensor.Shape{M, N}, out)
  45. b.ReportAllocs()
  46. b.ResetTimer()
  47. for i := 0; i < b.N; i++ {
  48. Linear(inT, wT, outT)
  49. }
  50. }
  51. func BenchmarkLinearQ4KDecode(b *testing.B) {
  52. M, K, N := 1, 512, 1024 // K multiple of 256
  53. in := make([]float32, M*K)
  54. for i := range in {
  55. in[i] = rand.Float32()
  56. }
  57. weightBlocks := makeQ4Blocks(N, K)
  58. out := make([]float32, M*N)
  59. inT := cpu.NewTensor(tensor.Shape{M, K}, in)
  60. wT := makeQ4Tensor(b, tensor.Shape{N, K}, weightBlocks)
  61. outT := cpu.NewTensor(tensor.Shape{M, N}, out)
  62. b.ReportAllocs()
  63. b.ResetTimer()
  64. for i := 0; i < b.N; i++ {
  65. Linear(inT, wT, outT)
  66. }
  67. }
  68. func BenchmarkLinearQ4KPrefill(b *testing.B) {
  69. M, K, N := 16, 512, 512
  70. in := make([]float32, M*K)
  71. for i := range in {
  72. in[i] = rand.Float32()
  73. }
  74. weightBlocks := makeQ4Blocks(N, K)
  75. out := make([]float32, M*N)
  76. inT := cpu.NewTensor(tensor.Shape{M, K}, in)
  77. wT := makeQ4Tensor(b, tensor.Shape{N, K}, weightBlocks)
  78. outT := cpu.NewTensor(tensor.Shape{M, N}, out)
  79. b.ReportAllocs()
  80. b.ResetTimer()
  81. for i := 0; i < b.N; i++ {
  82. Linear(inT, wT, outT)
  83. }
  84. }
  85. func fillQ4Blocks(bs []tensor.BlockQ4_K) {
  86. const fp16One = uint16(0x3c00)
  87. for i := range bs {
  88. b := &bs[i]
  89. b.D = fp16One
  90. b.DMin = 0
  91. for j := range b.Scales {
  92. b.Scales[j] = 1
  93. }
  94. for j := range b.QS {
  95. b.QS[j] = 0x11 // two nibbles set to 1
  96. }
  97. }
  98. }
  99. func makeQ4Blocks(N, K int) []tensor.BlockQ4_K {
  100. blocksPerRow := K / tensor.QK_K
  101. weightBlocks := make([]tensor.BlockQ4_K, N*blocksPerRow)
  102. fillQ4Blocks(weightBlocks)
  103. return weightBlocks
  104. }
  105. func makeQ4Tensor(tb testing.TB, shape tensor.Shape, blocks []tensor.BlockQ4_K) *cpu.Tensor {
  106. if len(blocks) == 0 {
  107. t, err := cpu.NewTensorFromBytes(shape, tensor.Q4_K, []byte{})
  108. if err != nil {
  109. tb.Fatalf("makeQ4Tensor empty: %v", err)
  110. }
  111. return t
  112. }
  113. blockSize := int(unsafe.Sizeof(tensor.BlockQ4_K{}))
  114. buf := unsafe.Slice((*byte)(unsafe.Pointer(&blocks[0])), len(blocks)*blockSize)
  115. t, err := cpu.NewTensorFromBytes(shape, tensor.Q4_K, buf)
  116. if err != nil {
  117. tb.Fatalf("makeQ4Tensor: %v", err)
  118. }
  119. return t
  120. }