// nn_bench_test.go
package nn

import (
	"math/rand"
	"strconv"
	"testing"

	"makarna/pkg/backend/cpu"
	"makarna/pkg/kvcache"
	"makarna/pkg/tensor"
)
  10. func BenchmarkRMSNorm(b *testing.B) {
  11. dim := 256
  12. rows := 16
  13. data := make([]float32, rows*dim)
  14. for i := range data {
  15. data[i] = rand.Float32()
  16. }
  17. w := make([]float32, dim)
  18. for i := range w {
  19. w[i] = 1
  20. }
  21. x := cpu.NewTensor(tensor.Shape{rows, dim}, data)
  22. ws := cpu.NewTensor(tensor.Shape{dim}, w)
  23. b.ReportAllocs()
  24. b.ResetTimer()
  25. for i := 0; i < b.N; i++ {
  26. RMSNorm(x, ws, 1e-5)
  27. }
  28. }
// BenchmarkCausalAttentionPackedVsBlocks benchmarks the token-major block
// attention path (CausalAttentionBlocks) against the head-major packed path
// (CausalAttentionPackedBlocks) over identical KV data, for both the exact
// and fast exp variants, across several KV lengths.
func BenchmarkCausalAttentionPackedVsBlocks(b *testing.B) {
	// Decode-like: newTokens=1, numHeads=16, numKVHeads=8, headDim=128.
	numHeads, numKVHeads, headDim := 16, 8, 128
	newTokens := 1
	kvDim := numKVHeads * headDim
	blockSize := 16
	// Single-row query shared by every sub-benchmark; values in [-0.5, 0.5).
	x := make([]float32, newTokens*numHeads*headDim)
	for i := range x {
		x[i] = rand.Float32() - 0.5
	}
	q := cpu.NewTensor(tensor.Shape{newTokens, numHeads * headDim}, x)
	out := cpu.NewTensor(tensor.Shape{newTokens, numHeads * headDim}, nil)
	// All kvLen values are multiples of blockSize (16), so every block below
	// is a full block and the full-block slicing of kData/vData is in range.
	for _, kvLen := range []int{256, 1024, 4096, 16384} {
		b.Run("kvLen="+strconv.Itoa(kvLen), func(b *testing.B) {
			kData := make([]float32, kvLen*kvDim)
			vData := make([]float32, kvLen*kvDim)
			for i := range kData {
				kData[i] = rand.Float32() - 0.5
			}
			for i := range vData {
				vData[i] = rand.Float32() - 0.5
			}
			// Build both view types over the same data.
			views := make([]kvcache.View, 0, (kvLen+blockSize-1)/blockSize)
			pviews := make([]kvcache.PackedView, 0, (kvLen+blockSize-1)/blockSize)
			for start := 0; start < kvLen; start += blockSize {
				// Clamp the final block's logical length (no-op here since
				// kvLen is a multiple of blockSize, but kept for generality).
				length := blockSize
				if start+length > kvLen {
					length = kvLen - start
				}
				// token-major blocks: rows are tokens, columns are the
				// concatenated KV heads (layout [blockSize][kvDim]).
				kBlk := cpu.NewTensor(tensor.Shape{blockSize, kvDim}, kData[start*kvDim:(start+blockSize)*kvDim])
				vBlk := cpu.NewTensor(tensor.Shape{blockSize, kvDim}, vData[start*kvDim:(start+blockSize)*kvDim])
				views = append(views, kvcache.View{K: kBlk, V: vBlk, Start: start, Length: length, Device: tensor.CPU})
				// packed blocks: head-major layout [numKVHeads][blockSize][headDim].
				pk := make([]float32, numKVHeads*blockSize*headDim)
				pv := make([]float32, numKVHeads*blockSize*headDim)
				// write packed: transpose each token's per-head slice from the
				// token-major source into the head-major destination.
				for t := 0; t < length; t++ {
					baseTok := (start + t) * kvDim
					for h := 0; h < numKVHeads; h++ {
						srcBase := baseTok + h*headDim
						dstBase := h*(blockSize*headDim) + t*headDim
						copy(pk[dstBase:dstBase+headDim], kData[srcBase:srcBase+headDim])
						copy(pv[dstBase:dstBase+headDim], vData[srcBase:srcBase+headDim])
					}
				}
				pviews = append(pviews, kvcache.PackedView{K: pk, V: pv, Start: start, Length: length, BlockSize: blockSize, HeadDim: headDim, NumKVHeads: numKVHeads})
			}
			// Run both implementations under each exp mode.
			for _, fast := range []bool{false, true} {
				name := "exp=exact"
				if fast {
					name = "exp=fast"
				}
				b.Run(name, func(b *testing.B) {
					// useFastExp is a package-level toggle (declared elsewhere
					// in this package); save, override, and restore it so
					// later sub-benchmarks see the original setting.
					orig := useFastExp
					useFastExp = fast
					defer func() { useFastExp = orig }()
					b.Run("blocks", func(b *testing.B) {
						b.ReportAllocs()
						b.ResetTimer()
						for i := 0; i < b.N; i++ {
							// Error discarded: only throughput is measured here.
							_ = CausalAttentionBlocks(q, views, out, numHeads, numKVHeads, headDim, kvLen-1)
						}
					})
					b.Run("packed", func(b *testing.B) {
						b.ReportAllocs()
						b.ResetTimer()
						for i := 0; i < b.N; i++ {
							_ = CausalAttentionPackedBlocks(q, pviews, out, numHeads, numKVHeads, headDim, kvLen-1)
						}
					})
				})
			}
		})
	}
}
  106. func BenchmarkSoftmax(b *testing.B) {
  107. n := 512
  108. data := make([]float32, n)
  109. for i := range data {
  110. data[i] = rand.Float32()
  111. }
  112. x := cpu.NewTensor(tensor.Shape{n}, data)
  113. b.ReportAllocs()
  114. b.ResetTimer()
  115. for i := 0; i < b.N; i++ {
  116. Softmax(x)
  117. }
  118. }
  119. func BenchmarkCausalAttentionCached_Fused(b *testing.B) {
  120. // Decode-like: newTokens=1, numHeads=16, numKVHeads=8, headDim=128.
  121. numHeads, numKVHeads, headDim := 16, 8, 128
  122. newTokens := 1
  123. kvDim := numKVHeads * headDim
  124. x := make([]float32, newTokens*numHeads*headDim)
  125. for i := range x {
  126. x[i] = rand.Float32() - 0.5
  127. }
  128. q := cpu.NewTensor(tensor.Shape{newTokens, numHeads * headDim}, x)
  129. for _, kvLen := range []int{256, 1024, 4096, 16384} {
  130. b.Run("kvLen="+strconv.Itoa(kvLen), func(b *testing.B) {
  131. kData := make([]float32, kvLen*kvDim)
  132. vData := make([]float32, kvLen*kvDim)
  133. for i := range kData {
  134. kData[i] = rand.Float32() - 0.5
  135. }
  136. for i := range vData {
  137. vData[i] = rand.Float32() - 0.5
  138. }
  139. k := cpu.NewTensor(tensor.Shape{kvLen, kvDim}, kData)
  140. v := cpu.NewTensor(tensor.Shape{kvLen, kvDim}, vData)
  141. out := cpu.NewTensor(tensor.Shape{newTokens, numHeads * headDim}, nil)
  142. b.ReportAllocs()
  143. b.ResetTimer()
  144. for i := 0; i < b.N; i++ {
  145. _ = CausalAttentionCached(q, k, v, out, numHeads, numKVHeads, headDim, kvLen-1)
  146. }
  147. })
  148. }
  149. }