| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168 |
- package nn
- import (
- "math/rand"
- "strconv"
- "testing"
- "makarna/pkg/backend/cpu"
- "makarna/pkg/kvcache"
- "makarna/pkg/tensor"
- )
- func BenchmarkRMSNorm(b *testing.B) {
- dim := 256
- rows := 16
- data := make([]float32, rows*dim)
- for i := range data {
- data[i] = rand.Float32()
- }
- w := make([]float32, dim)
- for i := range w {
- w[i] = 1
- }
- x := cpu.NewTensor(tensor.Shape{rows, dim}, data)
- ws := cpu.NewTensor(tensor.Shape{dim}, w)
- b.ReportAllocs()
- b.ResetTimer()
- for i := 0; i < b.N; i++ {
- RMSNorm(x, ws, 1e-5)
- }
- }
- func BenchmarkCausalAttentionPackedVsBlocks(b *testing.B) {
- // Decode-like: newTokens=1, numHeads=16, numKVHeads=8, headDim=128.
- numHeads, numKVHeads, headDim := 16, 8, 128
- newTokens := 1
- kvDim := numKVHeads * headDim
- blockSize := 16
- x := make([]float32, newTokens*numHeads*headDim)
- for i := range x {
- x[i] = rand.Float32() - 0.5
- }
- q := cpu.NewTensor(tensor.Shape{newTokens, numHeads * headDim}, x)
- out := cpu.NewTensor(tensor.Shape{newTokens, numHeads * headDim}, nil)
- for _, kvLen := range []int{256, 1024, 4096, 16384} {
- b.Run("kvLen="+strconv.Itoa(kvLen), func(b *testing.B) {
- kData := make([]float32, kvLen*kvDim)
- vData := make([]float32, kvLen*kvDim)
- for i := range kData {
- kData[i] = rand.Float32() - 0.5
- }
- for i := range vData {
- vData[i] = rand.Float32() - 0.5
- }
- // Build both view types over the same data.
- views := make([]kvcache.View, 0, (kvLen+blockSize-1)/blockSize)
- pviews := make([]kvcache.PackedView, 0, (kvLen+blockSize-1)/blockSize)
- for start := 0; start < kvLen; start += blockSize {
- length := blockSize
- if start+length > kvLen {
- length = kvLen - start
- }
- // token-major blocks
- kBlk := cpu.NewTensor(tensor.Shape{blockSize, kvDim}, kData[start*kvDim:(start+blockSize)*kvDim])
- vBlk := cpu.NewTensor(tensor.Shape{blockSize, kvDim}, vData[start*kvDim:(start+blockSize)*kvDim])
- views = append(views, kvcache.View{K: kBlk, V: vBlk, Start: start, Length: length, Device: tensor.CPU})
- // packed blocks
- pk := make([]float32, numKVHeads*blockSize*headDim)
- pv := make([]float32, numKVHeads*blockSize*headDim)
- // write packed
- for t := 0; t < length; t++ {
- baseTok := (start + t) * kvDim
- for h := 0; h < numKVHeads; h++ {
- srcBase := baseTok + h*headDim
- dstBase := h*(blockSize*headDim) + t*headDim
- copy(pk[dstBase:dstBase+headDim], kData[srcBase:srcBase+headDim])
- copy(pv[dstBase:dstBase+headDim], vData[srcBase:srcBase+headDim])
- }
- }
- pviews = append(pviews, kvcache.PackedView{K: pk, V: pv, Start: start, Length: length, BlockSize: blockSize, HeadDim: headDim, NumKVHeads: numKVHeads})
- }
- for _, fast := range []bool{false, true} {
- name := "exp=exact"
- if fast {
- name = "exp=fast"
- }
- b.Run(name, func(b *testing.B) {
- orig := useFastExp
- useFastExp = fast
- defer func() { useFastExp = orig }()
- b.Run("blocks", func(b *testing.B) {
- b.ReportAllocs()
- b.ResetTimer()
- for i := 0; i < b.N; i++ {
- _ = CausalAttentionBlocks(q, views, out, numHeads, numKVHeads, headDim, kvLen-1)
- }
- })
- b.Run("packed", func(b *testing.B) {
- b.ReportAllocs()
- b.ResetTimer()
- for i := 0; i < b.N; i++ {
- _ = CausalAttentionPackedBlocks(q, pviews, out, numHeads, numKVHeads, headDim, kvLen-1)
- }
- })
- })
- }
- })
- }
- }
- func BenchmarkSoftmax(b *testing.B) {
- n := 512
- data := make([]float32, n)
- for i := range data {
- data[i] = rand.Float32()
- }
- x := cpu.NewTensor(tensor.Shape{n}, data)
- b.ReportAllocs()
- b.ResetTimer()
- for i := 0; i < b.N; i++ {
- Softmax(x)
- }
- }
- func BenchmarkCausalAttentionCached_Fused(b *testing.B) {
- // Decode-like: newTokens=1, numHeads=16, numKVHeads=8, headDim=128.
- numHeads, numKVHeads, headDim := 16, 8, 128
- newTokens := 1
- kvDim := numKVHeads * headDim
- x := make([]float32, newTokens*numHeads*headDim)
- for i := range x {
- x[i] = rand.Float32() - 0.5
- }
- q := cpu.NewTensor(tensor.Shape{newTokens, numHeads * headDim}, x)
- for _, kvLen := range []int{256, 1024, 4096, 16384} {
- b.Run("kvLen="+strconv.Itoa(kvLen), func(b *testing.B) {
- kData := make([]float32, kvLen*kvDim)
- vData := make([]float32, kvLen*kvDim)
- for i := range kData {
- kData[i] = rand.Float32() - 0.5
- }
- for i := range vData {
- vData[i] = rand.Float32() - 0.5
- }
- k := cpu.NewTensor(tensor.Shape{kvLen, kvDim}, kData)
- v := cpu.NewTensor(tensor.Shape{kvLen, kvDim}, vData)
- out := cpu.NewTensor(tensor.Shape{newTokens, numHeads * headDim}, nil)
- b.ReportAllocs()
- b.ResetTimer()
- for i := 0; i < b.N; i++ {
- _ = CausalAttentionCached(q, k, v, out, numHeads, numKVHeads, headDim, kvLen-1)
- }
- })
- }
- }
|