package nn

import (
	"math/rand"
	"strconv"
	"testing"

	"makarna/pkg/backend/cpu"
	"makarna/pkg/kvcache"
	"makarna/pkg/tensor"
)

// BenchmarkRMSNorm measures RMSNorm over a 16x256 row-major matrix with
// unit weights and eps=1e-5.
func BenchmarkRMSNorm(b *testing.B) {
	dim := 256
	rows := 16
	data := make([]float32, rows*dim)
	for i := range data {
		data[i] = rand.Float32()
	}
	w := make([]float32, dim)
	for i := range w {
		w[i] = 1
	}
	x := cpu.NewTensor(tensor.Shape{rows, dim}, data)
	ws := cpu.NewTensor(tensor.Shape{dim}, w)
	b.ReportAllocs()
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		RMSNorm(x, ws, 1e-5)
	}
}

// BenchmarkCausalAttentionPackedVsBlocks compares CausalAttentionBlocks
// (token-major kvcache.View blocks) against CausalAttentionPackedBlocks
// (head-major kvcache.PackedView blocks) over the same KV data, for a
// decode-shaped call, across several KV lengths and both exp variants.
func BenchmarkCausalAttentionPackedVsBlocks(b *testing.B) {
	// Decode-like: newTokens=1, numHeads=16, numKVHeads=8, headDim=128.
	numHeads, numKVHeads, headDim := 16, 8, 128
	newTokens := 1
	kvDim := numKVHeads * headDim
	blockSize := 16

	x := make([]float32, newTokens*numHeads*headDim)
	for i := range x {
		x[i] = rand.Float32() - 0.5
	}
	q := cpu.NewTensor(tensor.Shape{newTokens, numHeads * headDim}, x)
	out := cpu.NewTensor(tensor.Shape{newTokens, numHeads * headDim}, nil)

	for _, kvLen := range []int{256, 1024, 4096, 16384} {
		b.Run("kvLen="+strconv.Itoa(kvLen), func(b *testing.B) {
			kData := make([]float32, kvLen*kvDim)
			vData := make([]float32, kvLen*kvDim)
			for i := range kData {
				kData[i] = rand.Float32() - 0.5
			}
			for i := range vData {
				vData[i] = rand.Float32() - 0.5
			}
			// Build both view types over the same data.
			views := make([]kvcache.View, 0, (kvLen+blockSize-1)/blockSize)
			pviews := make([]kvcache.PackedView, 0, (kvLen+blockSize-1)/blockSize)
			for start := 0; start < kvLen; start += blockSize {
				length := blockSize
				if start+length > kvLen {
					length = kvLen - start
				}
				// Token-major blocks. Slice by the clamped length so a
				// partial final block (kvLen not a multiple of blockSize)
				// cannot index past the end of kData/vData; for the kvLen
				// values used here every block is full, so length==blockSize.
				kBlk := cpu.NewTensor(tensor.Shape{length, kvDim}, kData[start*kvDim:(start+length)*kvDim])
				vBlk := cpu.NewTensor(tensor.Shape{length, kvDim}, vData[start*kvDim:(start+length)*kvDim])
				views = append(views, kvcache.View{K: kBlk, V: vBlk, Start: start, Length: length, Device: tensor.CPU})
				// Packed blocks: head-major layout [numKVHeads][blockSize][headDim].
				pk := make([]float32, numKVHeads*blockSize*headDim)
				pv := make([]float32, numKVHeads*blockSize*headDim)
				for t := 0; t < length; t++ {
					baseTok := (start + t) * kvDim
					for h := 0; h < numKVHeads; h++ {
						srcBase := baseTok + h*headDim
						dstBase := h*(blockSize*headDim) + t*headDim
						copy(pk[dstBase:dstBase+headDim], kData[srcBase:srcBase+headDim])
						copy(pv[dstBase:dstBase+headDim], vData[srcBase:srcBase+headDim])
					}
				}
				pviews = append(pviews, kvcache.PackedView{K: pk, V: pv, Start: start, Length: length, BlockSize: blockSize, HeadDim: headDim, NumKVHeads: numKVHeads})
			}
			for _, fast := range []bool{false, true} {
				name := "exp=exact"
				if fast {
					name = "exp=fast"
				}
				b.Run(name, func(b *testing.B) {
					// Flip the package-level exp selector for the duration of
					// this sub-benchmark and restore it afterwards.
					orig := useFastExp
					useFastExp = fast
					defer func() { useFastExp = orig }()
					b.Run("blocks", func(b *testing.B) {
						b.ReportAllocs()
						b.ResetTimer()
						for i := 0; i < b.N; i++ {
							_ = CausalAttentionBlocks(q, views, out, numHeads, numKVHeads, headDim, kvLen-1)
						}
					})
					b.Run("packed", func(b *testing.B) {
						b.ReportAllocs()
						b.ResetTimer()
						for i := 0; i < b.N; i++ {
							_ = CausalAttentionPackedBlocks(q, pviews, out, numHeads, numKVHeads, headDim, kvLen-1)
						}
					})
				})
			}
		})
	}
}

// BenchmarkSoftmax measures Softmax on a length-512 vector.
func BenchmarkSoftmax(b *testing.B) {
	n := 512
	data := make([]float32, n)
	for i := range data {
		data[i] = rand.Float32()
	}
	x := cpu.NewTensor(tensor.Shape{n}, data)
	b.ReportAllocs()
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		Softmax(x)
	}
}
func BenchmarkCausalAttentionCached_Fused(b *testing.B) { // Decode-like: newTokens=1, numHeads=16, numKVHeads=8, headDim=128. numHeads, numKVHeads, headDim := 16, 8, 128 newTokens := 1 kvDim := numKVHeads * headDim x := make([]float32, newTokens*numHeads*headDim) for i := range x { x[i] = rand.Float32() - 0.5 } q := cpu.NewTensor(tensor.Shape{newTokens, numHeads * headDim}, x) for _, kvLen := range []int{256, 1024, 4096, 16384} { b.Run("kvLen="+strconv.Itoa(kvLen), func(b *testing.B) { kData := make([]float32, kvLen*kvDim) vData := make([]float32, kvLen*kvDim) for i := range kData { kData[i] = rand.Float32() - 0.5 } for i := range vData { vData[i] = rand.Float32() - 0.5 } k := cpu.NewTensor(tensor.Shape{kvLen, kvDim}, kData) v := cpu.NewTensor(tensor.Shape{kvLen, kvDim}, vData) out := cpu.NewTensor(tensor.Shape{newTokens, numHeads * headDim}, nil) b.ReportAllocs() b.ResetTimer() for i := 0; i < b.N; i++ { _ = CausalAttentionCached(q, k, v, out, numHeads, numKVHeads, headDim, kvLen-1) } }) } }