package tensor

import "testing"

// This file benchmarks the K-quant dequantization and dot-product kernels.
//
// Every kernel that has both variants is measured twice on the same fixture:
//   - *_SIMD leaves hasAVX2/hasAVX512 untouched, so it measures whichever path
//     the kernel picks on the current machine;
//   - *_Scalar clears both flags for the duration of the benchmark.
//
// Each pair shares one runner so the two variants cannot drift apart.
//
// NOTE(review): D = 0x3C00 and DMin = 0x3800 look like IEEE half-precision
// 1.0 and 0.5 — confirm against the block type definitions.

// benchVecLen is the length of every activation and output vector used below.
const benchVecLen = 256

// benchForceScalar clears the hasAVX2 and hasAVX512 package flags and returns
// a function that puts the previous values back. Callers are expected to
// defer the returned function so the flags do not leak into other benchmarks.
func benchForceScalar() (restore func()) {
	prevAVX2, prevAVX512 := hasAVX2, hasAVX512
	hasAVX2, hasAVX512 = false, false
	return func() { hasAVX2, hasAVX512 = prevAVX2, prevAVX512 }
}

// benchActivations returns a benchVecLen-element vector whose i-th entry is
// float32(i%mod-bias) * scale.
func benchActivations(mod, bias int, scale float32) []float32 {
	x := make([]float32, benchVecLen)
	for i := range x {
		x[i] = float32(i%mod-bias) * scale
	}
	return x
}

// ---------------------------------------------------------------------------
// Dequantization
// ---------------------------------------------------------------------------

// benchDequantQ8K times DequantizeQ8_K on one block. When scalar is true the
// AVX flags are cleared first.
func benchDequantQ8K(b *testing.B, scalar bool) {
	var block BlockQ8_K
	for i := 0; i < benchVecLen; i++ {
		block.QS[i] = int8(i%17 - 8)
	}
	block.D = 0.125

	if scalar {
		restore := benchForceScalar()
		defer restore()
	}

	var out [benchVecLen]float32
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		DequantizeQ8_K(&block, out[:])
	}
}

// BenchmarkDequantQ8K_SIMD benchmarks DequantizeQ8_K with the AVX flags untouched.
func BenchmarkDequantQ8K_SIMD(b *testing.B) { benchDequantQ8K(b, false) }

// BenchmarkDequantQ8K_Scalar benchmarks DequantizeQ8_K with the AVX flags cleared.
func BenchmarkDequantQ8K_Scalar(b *testing.B) { benchDequantQ8K(b, true) }

// benchDequantQ4K times DequantizeQ4_K on one block. When scalar is true the
// AVX flags are cleared first.
func benchDequantQ4K(b *testing.B, scalar bool) {
	var block BlockQ4_K
	block.D = 0x3C00
	block.DMin = 0x3800
	for i := range block.Scales {
		block.Scales[i] = uint8((i % 4) + 1)
	}
	for i := range block.QS {
		block.QS[i] = uint8(i)
	}

	if scalar {
		restore := benchForceScalar()
		defer restore()
	}

	var out [benchVecLen]float32
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		DequantizeQ4_K(&block, out[:])
	}
}

// BenchmarkDequantQ4K_SIMD benchmarks DequantizeQ4_K with the AVX flags untouched.
func BenchmarkDequantQ4K_SIMD(b *testing.B) { benchDequantQ4K(b, false) }

// BenchmarkDequantQ4K_Scalar benchmarks DequantizeQ4_K with the AVX flags cleared.
func BenchmarkDequantQ4K_Scalar(b *testing.B) { benchDequantQ4K(b, true) }

// benchDequantQ2K times DequantizeQ2_K on one block. When scalar is true the
// AVX flags are cleared first.
func benchDequantQ2K(b *testing.B, scalar bool) {
	var block BlockQ2_K
	block.D = 0x3C00
	block.DMin = 0x3800
	for i := range block.Scales {
		block.Scales[i] = uint8((i%16)*16 + i%16)
	}
	for i := range block.QS {
		block.QS[i] = uint8(i)
	}

	if scalar {
		restore := benchForceScalar()
		defer restore()
	}

	var out [benchVecLen]float32
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		DequantizeQ2_K(&block, out[:])
	}
}

// BenchmarkDequantQ2K_SIMD benchmarks DequantizeQ2_K with the AVX flags untouched.
func BenchmarkDequantQ2K_SIMD(b *testing.B) { benchDequantQ2K(b, false) }

// BenchmarkDequantQ2K_Scalar benchmarks DequantizeQ2_K with the AVX flags cleared.
func BenchmarkDequantQ2K_Scalar(b *testing.B) { benchDequantQ2K(b, true) }

// benchDequantQ3K times DequantizeQ3_K on one block. When scalar is true the
// AVX flags are cleared first.
func benchDequantQ3K(b *testing.B, scalar bool) {
	var block BlockQ3_K
	block.D = 0x3C00
	for i := range block.Scales {
		block.Scales[i] = uint8(i % 64)
	}
	for i := range block.QS {
		block.QS[i] = uint8(i)
	}
	for i := range block.HMask {
		block.HMask[i] = uint8(i * 7)
	}

	if scalar {
		restore := benchForceScalar()
		defer restore()
	}

	var out [benchVecLen]float32
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		DequantizeQ3_K(&block, out[:])
	}
}

// BenchmarkDequantQ3K_SIMD benchmarks DequantizeQ3_K with the AVX flags untouched.
func BenchmarkDequantQ3K_SIMD(b *testing.B) { benchDequantQ3K(b, false) }

// BenchmarkDequantQ3K_Scalar benchmarks DequantizeQ3_K with the AVX flags cleared.
func BenchmarkDequantQ3K_Scalar(b *testing.B) { benchDequantQ3K(b, true) }

// BenchmarkDequantQ6K_SIMD benchmarks DequantizeQ6_K with the AVX flags
// untouched. There is no scalar counterpart in this file.
func BenchmarkDequantQ6K_SIMD(b *testing.B) {
	var block BlockQ6_K
	block.D = 0x3C00
	for i := range block.Scales {
		block.Scales[i] = int8((i % 16) - 8)
	}
	for i := range block.QL {
		block.QL[i] = uint8(i)
	}
	for i := range block.QH {
		block.QH[i] = uint8(i * 3)
	}

	var out [benchVecLen]float32
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		DequantizeQ6_K(&block, out[:])
	}
}

// ---------------------------------------------------------------------------
// Single-block dot products with precomputed parameters
// ---------------------------------------------------------------------------

// benchDotQ5KParams times DotQ5_K_Params on one block. The parameters are
// built before the AVX flags are (optionally) cleared, so only the dot
// product itself runs with the flags cleared.
func benchDotQ5KParams(b *testing.B, scalar bool) {
	var block BlockQ5_K
	block.D = 0x3C00
	block.DMin = 0x3800
	for i := range block.Scales {
		block.Scales[i] = uint8((i%7)*9 + 3)
	}
	for i := range block.QH {
		block.QH[i] = uint8(i * 13)
	}
	for i := range block.QS {
		block.QS[i] = uint8(i * 5)
	}
	x := benchActivations(23, 11, 0.25)
	p := GetQ5KDotParams([]BlockQ5_K{block})

	if scalar {
		restore := benchForceScalar()
		defer restore()
	}

	var sum float32
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		sum += DotQ5_K_Params(&block, &p[0], x)
	}
	_ = sum // keep the accumulated result observably used
}

// BenchmarkDotQ5K_Params_SIMD benchmarks DotQ5_K_Params with the AVX flags untouched.
func BenchmarkDotQ5K_Params_SIMD(b *testing.B) { benchDotQ5KParams(b, false) }

// BenchmarkDotQ5K_Params_Scalar benchmarks DotQ5_K_Params with the AVX flags cleared.
func BenchmarkDotQ5K_Params_Scalar(b *testing.B) { benchDotQ5KParams(b, true) }

// benchDotQ6KParams times DotQ6_K_Params on one block. The parameters are
// built before the AVX flags are (optionally) cleared.
func benchDotQ6KParams(b *testing.B, scalar bool) {
	var block BlockQ6_K
	block.D = 0x3C00
	for i := range block.Scales {
		block.Scales[i] = int8((i % 11) - 5)
	}
	for i := range block.QL {
		block.QL[i] = uint8(i * 7)
	}
	for i := range block.QH {
		block.QH[i] = uint8(i * 13)
	}
	x := benchActivations(23, 11, 0.25)
	p := GetQ6KDotParams([]BlockQ6_K{block})

	if scalar {
		restore := benchForceScalar()
		defer restore()
	}

	var sum float32
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		sum += DotQ6_K_Params(&block, &p[0], x)
	}
	_ = sum // keep the accumulated result observably used
}

// BenchmarkDotQ6K_Params_SIMD benchmarks DotQ6_K_Params with the AVX flags untouched.
func BenchmarkDotQ6K_Params_SIMD(b *testing.B) { benchDotQ6KParams(b, false) }

// BenchmarkDotQ6K_Params_Scalar benchmarks DotQ6_K_Params with the AVX flags cleared.
func BenchmarkDotQ6K_Params_Scalar(b *testing.B) { benchDotQ6KParams(b, true) }

// benchDotQ2KParams times DotQ2_K_Params on one block. The parameters are
// built before the AVX flags are (optionally) cleared.
func benchDotQ2KParams(b *testing.B, scalar bool) {
	var block BlockQ2_K
	block.D = 0x3C00
	block.DMin = 0x3800
	for i := range block.Scales {
		block.Scales[i] = uint8(i*7 + 3)
	}
	for i := range block.QS {
		block.QS[i] = uint8(i * 5)
	}
	x := benchActivations(23, 11, 0.25)
	p := GetQ2KDotParams([]BlockQ2_K{block})

	if scalar {
		restore := benchForceScalar()
		defer restore()
	}

	var sum float32
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		sum += DotQ2_K_Params(&block, &p[0], x)
	}
	_ = sum // keep the accumulated result observably used
}

// BenchmarkDotQ2K_Params_SIMD benchmarks DotQ2_K_Params with the AVX flags untouched.
func BenchmarkDotQ2K_Params_SIMD(b *testing.B) { benchDotQ2KParams(b, false) }

// BenchmarkDotQ2K_Params_Scalar benchmarks DotQ2_K_Params with the AVX flags cleared.
func BenchmarkDotQ2K_Params_Scalar(b *testing.B) { benchDotQ2KParams(b, true) }

// benchDotQ3KParams times DotQ3_K_Params on one block. The parameters are
// built before the AVX flags are (optionally) cleared.
func benchDotQ3KParams(b *testing.B, scalar bool) {
	var block BlockQ3_K
	block.D = 0x3C00
	for i := range block.Scales {
		block.Scales[i] = uint8(i*11 + 3)
	}
	for i := range block.QS {
		block.QS[i] = uint8(i * 5)
	}
	for i := range block.HMask {
		block.HMask[i] = uint8(i * 13)
	}
	x := benchActivations(23, 11, 0.25)
	p := GetQ3KDotParams([]BlockQ3_K{block})

	if scalar {
		restore := benchForceScalar()
		defer restore()
	}

	var sum float32
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		sum += DotQ3_K_Params(&block, &p[0], x)
	}
	_ = sum // keep the accumulated result observably used
}

// BenchmarkDotQ3K_Params_SIMD benchmarks DotQ3_K_Params with the AVX flags untouched.
func BenchmarkDotQ3K_Params_SIMD(b *testing.B) { benchDotQ3KParams(b, false) }

// BenchmarkDotQ3K_Params_Scalar benchmarks DotQ3_K_Params with the AVX flags cleared.
func BenchmarkDotQ3K_Params_Scalar(b *testing.B) { benchDotQ3KParams(b, true) }

// ---------------------------------------------------------------------------
// Eight-block tile dot products
// ---------------------------------------------------------------------------

// benchDotQ5KTile8 times DotQ5KTile8 over eight distinct blocks. Only sums[0]
// is folded into the accumulator. The parameters are built before the AVX
// flags are (optionally) cleared.
func benchDotQ5KTile8(b *testing.B, scalar bool) {
	var blocks [8]BlockQ5_K
	for bi := range blocks {
		blk := &blocks[bi]
		blk.D = 0x3C00
		blk.DMin = 0x3800
		for i := range blk.Scales {
			blk.Scales[i] = uint8((i+bi)%11 + 1)
		}
		for i := range blk.QH {
			blk.QH[i] = uint8(i*5 + bi)
		}
		for i := range blk.QS {
			blk.QS[i] = uint8(i*7 + bi)
		}
	}
	x := benchActivations(19, 9, 0.125)
	wp := GetQ5KDotParams(blocks[:])

	if scalar {
		restore := benchForceScalar()
		defer restore()
	}

	var acc float32
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		var sums [8]float32
		DotQ5KTile8(&sums, blocks[:], wp, 0, 1, &x[0], 8)
		acc += sums[0]
	}
	_ = acc // keep the accumulated result observably used
}

// BenchmarkDotQ5KTile8_SIMD benchmarks DotQ5KTile8 with the AVX flags untouched.
func BenchmarkDotQ5KTile8_SIMD(b *testing.B) { benchDotQ5KTile8(b, false) }

// BenchmarkDotQ5KTile8_Scalar benchmarks DotQ5KTile8 with the AVX flags cleared.
func BenchmarkDotQ5KTile8_Scalar(b *testing.B) { benchDotQ5KTile8(b, true) }

// benchDotQ6KTile8 times DotQ6KTile8 over eight distinct blocks. The
// parameters are built before the AVX flags are (optionally) cleared.
func benchDotQ6KTile8(b *testing.B, scalar bool) {
	var blocks [8]BlockQ6_K
	for bi := range blocks {
		blk := &blocks[bi]
		blk.D = 0x3C00
		for i := range blk.Scales {
			blk.Scales[i] = int8(((i + bi) % 13) - 6)
		}
		for i := range blk.QL {
			blk.QL[i] = uint8(i*3 + bi)
		}
		for i := range blk.QH {
			blk.QH[i] = uint8(i*5 + bi)
		}
	}
	x := benchActivations(19, 9, 0.125)
	wp := GetQ6KDotParams(blocks[:])

	if scalar {
		restore := benchForceScalar()
		defer restore()
	}

	var acc float32
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		var sums [8]float32
		DotQ6KTile8(&sums, blocks[:], wp, 0, 1, &x[0], 8)
		acc += sums[0]
	}
	_ = acc // keep the accumulated result observably used
}

// BenchmarkDotQ6KTile8_SIMD benchmarks DotQ6KTile8 with the AVX flags untouched.
func BenchmarkDotQ6KTile8_SIMD(b *testing.B) { benchDotQ6KTile8(b, false) }

// BenchmarkDotQ6KTile8_Scalar benchmarks DotQ6KTile8 with the AVX flags cleared.
func BenchmarkDotQ6KTile8_Scalar(b *testing.B) { benchDotQ6KTile8(b, true) }

// benchDotQ2KTile8 times DotQ2KTile8 over eight distinct blocks. The
// parameters are built before the AVX flags are (optionally) cleared.
func benchDotQ2KTile8(b *testing.B, scalar bool) {
	var blocks [8]BlockQ2_K
	for bi := range blocks {
		blk := &blocks[bi]
		blk.D = 0x3C00
		blk.DMin = 0x3800
		for i := range blk.Scales {
			blk.Scales[i] = uint8((i+bi)%11 + 1)
		}
		for i := range blk.QS {
			blk.QS[i] = uint8(i*7 + bi)
		}
	}
	x := benchActivations(19, 9, 0.125)
	wp := GetQ2KDotParams(blocks[:])

	if scalar {
		restore := benchForceScalar()
		defer restore()
	}

	var acc float32
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		var sums [8]float32
		DotQ2KTile8(&sums, blocks[:], wp, 0, 1, &x[0], 8)
		acc += sums[0]
	}
	_ = acc // keep the accumulated result observably used
}

// BenchmarkDotQ2KTile8_SIMD benchmarks DotQ2KTile8 with the AVX flags untouched.
func BenchmarkDotQ2KTile8_SIMD(b *testing.B) { benchDotQ2KTile8(b, false) }

// BenchmarkDotQ2KTile8_Scalar benchmarks DotQ2KTile8 with the AVX flags cleared.
func BenchmarkDotQ2KTile8_Scalar(b *testing.B) { benchDotQ2KTile8(b, true) }

// benchDotQ3KTile8 times DotQ3KTile8 over eight distinct blocks. The
// parameters are built before the AVX flags are (optionally) cleared.
func benchDotQ3KTile8(b *testing.B, scalar bool) {
	var blocks [8]BlockQ3_K
	for bi := range blocks {
		blk := &blocks[bi]
		blk.D = 0x3C00
		for i := range blk.Scales {
			blk.Scales[i] = uint8((i + bi) % 64)
		}
		for i := range blk.QS {
			blk.QS[i] = uint8(i*7 + bi)
		}
		for i := range blk.HMask {
			blk.HMask[i] = uint8(i*5 + bi)
		}
	}
	x := benchActivations(19, 9, 0.125)
	wp := GetQ3KDotParams(blocks[:])

	if scalar {
		restore := benchForceScalar()
		defer restore()
	}

	var acc float32
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		var sums [8]float32
		DotQ3KTile8(&sums, blocks[:], wp, 0, 1, &x[0], 8)
		acc += sums[0]
	}
	_ = acc // keep the accumulated result observably used
}

// BenchmarkDotQ3KTile8_SIMD benchmarks DotQ3KTile8 with the AVX flags untouched.
func BenchmarkDotQ3KTile8_SIMD(b *testing.B) { benchDotQ3KTile8(b, false) }

// BenchmarkDotQ3KTile8_Scalar benchmarks DotQ3KTile8 with the AVX flags cleared.
func BenchmarkDotQ3KTile8_Scalar(b *testing.B) { benchDotQ3KTile8(b, true) }

// benchDotQ8KTile8 times DotQ8KTile8 over eight distinct blocks. Unlike the
// other tile kernels, this call takes no precomputed parameter slice.
func benchDotQ8KTile8(b *testing.B, scalar bool) {
	var blocks [8]BlockQ8_K
	for bi := range blocks {
		blk := &blocks[bi]
		for i := 0; i < benchVecLen; i++ {
			blk.QS[i] = int8((i+bi)%31 - 15)
		}
		blk.D = 0.125
	}
	x := benchActivations(23, 11, 0.25)

	if scalar {
		restore := benchForceScalar()
		defer restore()
	}

	var acc float32
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		var sums [8]float32
		DotQ8KTile8(&sums, blocks[:], 0, 1, &x[0], 8)
		acc += sums[0]
	}
	_ = acc // keep the accumulated result observably used
}

// BenchmarkDotQ8KTile8_SIMD benchmarks DotQ8KTile8 with the AVX flags untouched.
func BenchmarkDotQ8KTile8_SIMD(b *testing.B) { benchDotQ8KTile8(b, false) }

// BenchmarkDotQ8KTile8_Scalar benchmarks DotQ8KTile8 with the AVX flags cleared.
func BenchmarkDotQ8KTile8_Scalar(b *testing.B) { benchDotQ8KTile8(b, true) }