| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759 |
- package tensor
- import "testing"
- // BenchmarkDequantQ8K_SIMD benchmarks the SIMD dequantization path
- func BenchmarkDequantQ8K_SIMD(b *testing.B) {
- var block BlockQ8_K
- for i := 0; i < 256; i++ {
- block.QS[i] = int8(i%17 - 8)
- }
- block.D = 0.125
- var out [256]float32
- b.ResetTimer()
- for i := 0; i < b.N; i++ {
- DequantizeQ8_K(&block, out[:])
- }
- }
- // BenchmarkDequantQ8K_Scalar benchmarks the scalar fallback path
- func BenchmarkDequantQ8K_Scalar(b *testing.B) {
- var block BlockQ8_K
- for i := 0; i < 256; i++ {
- block.QS[i] = int8(i%17 - 8)
- }
- block.D = 0.125
- // Force scalar path
- origAVX2, origAVX512 := hasAVX2, hasAVX512
- hasAVX2, hasAVX512 = false, false
- defer func() { hasAVX2, hasAVX512 = origAVX2, origAVX512 }()
- var out [256]float32
- b.ResetTimer()
- for i := 0; i < b.N; i++ {
- DequantizeQ8_K(&block, out[:])
- }
- }
- // BenchmarkDequantQ4K_SIMD benchmarks the SIMD dequantization path
- func BenchmarkDequantQ4K_SIMD(b *testing.B) {
- var block BlockQ4_K
- block.D = 0x3C00
- block.DMin = 0x3800
- for i := range block.Scales {
- block.Scales[i] = uint8((i % 4) + 1)
- }
- for i := range block.QS {
- block.QS[i] = uint8(i)
- }
- var out [256]float32
- b.ResetTimer()
- for i := 0; i < b.N; i++ {
- DequantizeQ4_K(&block, out[:])
- }
- }
- // BenchmarkDequantQ4K_Scalar benchmarks the scalar fallback path
- func BenchmarkDequantQ4K_Scalar(b *testing.B) {
- var block BlockQ4_K
- block.D = 0x3C00
- block.DMin = 0x3800
- for i := range block.Scales {
- block.Scales[i] = uint8((i % 4) + 1)
- }
- for i := range block.QS {
- block.QS[i] = uint8(i)
- }
- origAVX2, origAVX512 := hasAVX2, hasAVX512
- hasAVX2, hasAVX512 = false, false
- defer func() { hasAVX2, hasAVX512 = origAVX2, origAVX512 }()
- var out [256]float32
- b.ResetTimer()
- for i := 0; i < b.N; i++ {
- DequantizeQ4_K(&block, out[:])
- }
- }
- // BenchmarkDequantQ2K_SIMD benchmarks the SIMD dequantization path
- func BenchmarkDequantQ2K_SIMD(b *testing.B) {
- var block BlockQ2_K
- block.D = 0x3C00
- block.DMin = 0x3800
- for i := range block.Scales {
- block.Scales[i] = uint8((i%16)*16 + i%16)
- }
- for i := range block.QS {
- block.QS[i] = uint8(i)
- }
- var out [256]float32
- b.ResetTimer()
- for i := 0; i < b.N; i++ {
- DequantizeQ2_K(&block, out[:])
- }
- }
- // BenchmarkDequantQ2K_Scalar benchmarks the scalar fallback path
- func BenchmarkDequantQ2K_Scalar(b *testing.B) {
- var block BlockQ2_K
- block.D = 0x3C00
- block.DMin = 0x3800
- for i := range block.Scales {
- block.Scales[i] = uint8((i%16)*16 + i%16)
- }
- for i := range block.QS {
- block.QS[i] = uint8(i)
- }
- origAVX2, origAVX512 := hasAVX2, hasAVX512
- hasAVX2, hasAVX512 = false, false
- defer func() { hasAVX2, hasAVX512 = origAVX2, origAVX512 }()
- var out [256]float32
- b.ResetTimer()
- for i := 0; i < b.N; i++ {
- DequantizeQ2_K(&block, out[:])
- }
- }
- // BenchmarkDequantQ3K_SIMD benchmarks the optimized Go path
- func BenchmarkDequantQ3K_SIMD(b *testing.B) {
- var block BlockQ3_K
- block.D = 0x3C00
- for i := range block.Scales {
- block.Scales[i] = uint8(i % 64)
- }
- for i := range block.QS {
- block.QS[i] = uint8(i)
- }
- for i := range block.HMask {
- block.HMask[i] = uint8(i * 7)
- }
- var out [256]float32
- b.ResetTimer()
- for i := 0; i < b.N; i++ {
- DequantizeQ3_K(&block, out[:])
- }
- }
- // BenchmarkDequantQ3K_Scalar benchmarks the scalar fallback path
- func BenchmarkDequantQ3K_Scalar(b *testing.B) {
- var block BlockQ3_K
- block.D = 0x3C00
- for i := range block.Scales {
- block.Scales[i] = uint8(i % 64)
- }
- for i := range block.QS {
- block.QS[i] = uint8(i)
- }
- for i := range block.HMask {
- block.HMask[i] = uint8(i * 7)
- }
- origAVX2, origAVX512 := hasAVX2, hasAVX512
- hasAVX2, hasAVX512 = false, false
- defer func() { hasAVX2, hasAVX512 = origAVX2, origAVX512 }()
- var out [256]float32
- b.ResetTimer()
- for i := 0; i < b.N; i++ {
- DequantizeQ3_K(&block, out[:])
- }
- }
- // BenchmarkDequantQ6K_SIMD benchmarks the optimized Go path
- func BenchmarkDequantQ6K_SIMD(b *testing.B) {
- var block BlockQ6_K
- block.D = 0x3C00
- for i := range block.Scales {
- block.Scales[i] = int8((i % 16) - 8)
- }
- for i := range block.QL {
- block.QL[i] = uint8(i)
- }
- for i := range block.QH {
- block.QH[i] = uint8(i * 3)
- }
- var out [256]float32
- b.ResetTimer()
- for i := 0; i < b.N; i++ {
- DequantizeQ6_K(&block, out[:])
- }
- }
- func BenchmarkDotQ5K_Params_SIMD(b *testing.B) {
- var block BlockQ5_K
- block.D = 0x3C00
- block.DMin = 0x3800
- for i := range block.Scales {
- block.Scales[i] = uint8((i%7)*9 + 3)
- }
- for i := range block.QH {
- block.QH[i] = uint8(i * 13)
- }
- for i := range block.QS {
- block.QS[i] = uint8(i * 5)
- }
- x := make([]float32, 256)
- for i := range x {
- x[i] = float32((i%23)-11) * 0.25
- }
- p := GetQ5KDotParams([]BlockQ5_K{block})
- var sum float32
- b.ResetTimer()
- for i := 0; i < b.N; i++ {
- sum += DotQ5_K_Params(&block, &p[0], x)
- }
- _ = sum
- }
- func BenchmarkDotQ5K_Params_Scalar(b *testing.B) {
- var block BlockQ5_K
- block.D = 0x3C00
- block.DMin = 0x3800
- for i := range block.Scales {
- block.Scales[i] = uint8((i%7)*9 + 3)
- }
- for i := range block.QH {
- block.QH[i] = uint8(i * 13)
- }
- for i := range block.QS {
- block.QS[i] = uint8(i * 5)
- }
- x := make([]float32, 256)
- for i := range x {
- x[i] = float32((i%23)-11) * 0.25
- }
- p := GetQ5KDotParams([]BlockQ5_K{block})
- origAVX2, origAVX512 := hasAVX2, hasAVX512
- hasAVX2, hasAVX512 = false, false
- defer func() { hasAVX2, hasAVX512 = origAVX2, origAVX512 }()
- var sum float32
- b.ResetTimer()
- for i := 0; i < b.N; i++ {
- sum += DotQ5_K_Params(&block, &p[0], x)
- }
- _ = sum
- }
- func BenchmarkDotQ5KTile8_SIMD(b *testing.B) {
- var blocks [8]BlockQ5_K
- for bi := range blocks {
- blk := &blocks[bi]
- blk.D = 0x3C00
- blk.DMin = 0x3800
- for i := range blk.Scales {
- blk.Scales[i] = uint8((i+bi)%11 + 1)
- }
- for i := range blk.QH {
- blk.QH[i] = uint8(i*5 + bi)
- }
- for i := range blk.QS {
- blk.QS[i] = uint8(i*7 + bi)
- }
- }
- x := make([]float32, 256)
- for i := range x {
- x[i] = float32((i%19)-9) * 0.125
- }
- wp := GetQ5KDotParams(blocks[:])
- var acc float32
- b.ResetTimer()
- for i := 0; i < b.N; i++ {
- var sums [8]float32
- DotQ5KTile8(&sums, blocks[:], wp, 0, 1, &x[0], 8)
- acc += sums[0]
- }
- _ = acc
- }
- func BenchmarkDotQ6K_Params_SIMD(b *testing.B) {
- var block BlockQ6_K
- block.D = 0x3C00
- for i := range block.Scales {
- block.Scales[i] = int8((i%11) - 5)
- }
- for i := range block.QL {
- block.QL[i] = uint8(i * 7)
- }
- for i := range block.QH {
- block.QH[i] = uint8(i * 13)
- }
- x := make([]float32, 256)
- for i := range x {
- x[i] = float32((i%23)-11) * 0.25
- }
- p := GetQ6KDotParams([]BlockQ6_K{block})
- var sum float32
- b.ResetTimer()
- for i := 0; i < b.N; i++ {
- sum += DotQ6_K_Params(&block, &p[0], x)
- }
- _ = sum
- }
- func BenchmarkDotQ6K_Params_Scalar(b *testing.B) {
- var block BlockQ6_K
- block.D = 0x3C00
- for i := range block.Scales {
- block.Scales[i] = int8((i%11) - 5)
- }
- for i := range block.QL {
- block.QL[i] = uint8(i * 7)
- }
- for i := range block.QH {
- block.QH[i] = uint8(i * 13)
- }
- x := make([]float32, 256)
- for i := range x {
- x[i] = float32((i%23)-11) * 0.25
- }
- p := GetQ6KDotParams([]BlockQ6_K{block})
- origAVX2, origAVX512 := hasAVX2, hasAVX512
- hasAVX2, hasAVX512 = false, false
- defer func() { hasAVX2, hasAVX512 = origAVX2, origAVX512 }()
- var sum float32
- b.ResetTimer()
- for i := 0; i < b.N; i++ {
- sum += DotQ6_K_Params(&block, &p[0], x)
- }
- _ = sum
- }
- func BenchmarkDotQ6KTile8_SIMD(b *testing.B) {
- var blocks [8]BlockQ6_K
- for bi := range blocks {
- blk := &blocks[bi]
- blk.D = 0x3C00
- for i := range blk.Scales {
- blk.Scales[i] = int8(((i + bi) % 13) - 6)
- }
- for i := range blk.QL {
- blk.QL[i] = uint8(i*3 + bi)
- }
- for i := range blk.QH {
- blk.QH[i] = uint8(i*5 + bi)
- }
- }
- x := make([]float32, 256)
- for i := range x {
- x[i] = float32((i%19)-9) * 0.125
- }
- wp := GetQ6KDotParams(blocks[:])
- var acc float32
- b.ResetTimer()
- for i := 0; i < b.N; i++ {
- var sums [8]float32
- DotQ6KTile8(&sums, blocks[:], wp, 0, 1, &x[0], 8)
- acc += sums[0]
- }
- _ = acc
- }
- func BenchmarkDotQ2K_Params_SIMD(b *testing.B) {
- var block BlockQ2_K
- block.D = 0x3C00
- block.DMin = 0x3800
- for i := range block.Scales {
- block.Scales[i] = uint8(i*7 + 3)
- }
- for i := range block.QS {
- block.QS[i] = uint8(i * 5)
- }
- x := make([]float32, 256)
- for i := range x {
- x[i] = float32((i%23)-11) * 0.25
- }
- p := GetQ2KDotParams([]BlockQ2_K{block})
- var sum float32
- b.ResetTimer()
- for i := 0; i < b.N; i++ {
- sum += DotQ2_K_Params(&block, &p[0], x)
- }
- _ = sum
- }
- func BenchmarkDotQ2K_Params_Scalar(b *testing.B) {
- var block BlockQ2_K
- block.D = 0x3C00
- block.DMin = 0x3800
- for i := range block.Scales {
- block.Scales[i] = uint8(i*7 + 3)
- }
- for i := range block.QS {
- block.QS[i] = uint8(i * 5)
- }
- x := make([]float32, 256)
- for i := range x {
- x[i] = float32((i%23)-11) * 0.25
- }
- p := GetQ2KDotParams([]BlockQ2_K{block})
- origAVX2, origAVX512 := hasAVX2, hasAVX512
- hasAVX2, hasAVX512 = false, false
- defer func() { hasAVX2, hasAVX512 = origAVX2, origAVX512 }()
- var sum float32
- b.ResetTimer()
- for i := 0; i < b.N; i++ {
- sum += DotQ2_K_Params(&block, &p[0], x)
- }
- _ = sum
- }
- func BenchmarkDotQ2KTile8_SIMD(b *testing.B) {
- var blocks [8]BlockQ2_K
- for bi := range blocks {
- blk := &blocks[bi]
- blk.D = 0x3C00
- blk.DMin = 0x3800
- for i := range blk.Scales {
- blk.Scales[i] = uint8((i+bi)%11 + 1)
- }
- for i := range blk.QS {
- blk.QS[i] = uint8(i*7 + bi)
- }
- }
- x := make([]float32, 256)
- for i := range x {
- x[i] = float32((i%19)-9) * 0.125
- }
- wp := GetQ2KDotParams(blocks[:])
- var acc float32
- b.ResetTimer()
- for i := 0; i < b.N; i++ {
- var sums [8]float32
- DotQ2KTile8(&sums, blocks[:], wp, 0, 1, &x[0], 8)
- acc += sums[0]
- }
- _ = acc
- }
- func BenchmarkDotQ2KTile8_Scalar(b *testing.B) {
- var blocks [8]BlockQ2_K
- for bi := range blocks {
- blk := &blocks[bi]
- blk.D = 0x3C00
- blk.DMin = 0x3800
- for i := range blk.Scales {
- blk.Scales[i] = uint8((i+bi)%11 + 1)
- }
- for i := range blk.QS {
- blk.QS[i] = uint8(i*7 + bi)
- }
- }
- x := make([]float32, 256)
- for i := range x {
- x[i] = float32((i%19)-9) * 0.125
- }
- wp := GetQ2KDotParams(blocks[:])
- origAVX2, origAVX512 := hasAVX2, hasAVX512
- hasAVX2, hasAVX512 = false, false
- defer func() { hasAVX2, hasAVX512 = origAVX2, origAVX512 }()
- var acc float32
- b.ResetTimer()
- for i := 0; i < b.N; i++ {
- var sums [8]float32
- DotQ2KTile8(&sums, blocks[:], wp, 0, 1, &x[0], 8)
- acc += sums[0]
- }
- _ = acc
- }
- func BenchmarkDotQ8KTile8_SIMD(b *testing.B) {
- var blocks [8]BlockQ8_K
- for bi := range blocks {
- blk := &blocks[bi]
- for i := 0; i < 256; i++ {
- blk.QS[i] = int8((i+bi)%31 - 15)
- }
- blk.D = 0.125
- }
- x := make([]float32, 256)
- for i := range x {
- x[i] = float32((i%23)-11) * 0.25
- }
- var acc float32
- b.ResetTimer()
- for i := 0; i < b.N; i++ {
- var sums [8]float32
- DotQ8KTile8(&sums, blocks[:], 0, 1, &x[0], 8)
- acc += sums[0]
- }
- _ = acc
- }
- func BenchmarkDotQ3K_Params_SIMD(b *testing.B) {
- var block BlockQ3_K
- block.D = 0x3C00
- for i := range block.Scales {
- block.Scales[i] = uint8(i*11 + 3)
- }
- for i := range block.QS {
- block.QS[i] = uint8(i * 5)
- }
- for i := range block.HMask {
- block.HMask[i] = uint8(i * 13)
- }
- x := make([]float32, 256)
- for i := range x {
- x[i] = float32((i%23)-11) * 0.25
- }
- p := GetQ3KDotParams([]BlockQ3_K{block})
- var sum float32
- b.ResetTimer()
- for i := 0; i < b.N; i++ {
- sum += DotQ3_K_Params(&block, &p[0], x)
- }
- _ = sum
- }
- func BenchmarkDotQ3K_Params_Scalar(b *testing.B) {
- var block BlockQ3_K
- block.D = 0x3C00
- for i := range block.Scales {
- block.Scales[i] = uint8(i*11 + 3)
- }
- for i := range block.QS {
- block.QS[i] = uint8(i * 5)
- }
- for i := range block.HMask {
- block.HMask[i] = uint8(i * 13)
- }
- x := make([]float32, 256)
- for i := range x {
- x[i] = float32((i%23)-11) * 0.25
- }
- p := GetQ3KDotParams([]BlockQ3_K{block})
- origAVX2, origAVX512 := hasAVX2, hasAVX512
- hasAVX2, hasAVX512 = false, false
- defer func() { hasAVX2, hasAVX512 = origAVX2, origAVX512 }()
- var sum float32
- b.ResetTimer()
- for i := 0; i < b.N; i++ {
- sum += DotQ3_K_Params(&block, &p[0], x)
- }
- _ = sum
- }
- func BenchmarkDotQ3KTile8_SIMD(b *testing.B) {
- var blocks [8]BlockQ3_K
- for bi := range blocks {
- blk := &blocks[bi]
- blk.D = 0x3C00
- for i := range blk.Scales {
- blk.Scales[i] = uint8((i+bi)%64)
- }
- for i := range blk.QS {
- blk.QS[i] = uint8(i*7 + bi)
- }
- for i := range blk.HMask {
- blk.HMask[i] = uint8(i*5 + bi)
- }
- }
- x := make([]float32, 256)
- for i := range x {
- x[i] = float32((i%19)-9) * 0.125
- }
- wp := GetQ3KDotParams(blocks[:])
- var acc float32
- b.ResetTimer()
- for i := 0; i < b.N; i++ {
- var sums [8]float32
- DotQ3KTile8(&sums, blocks[:], wp, 0, 1, &x[0], 8)
- acc += sums[0]
- }
- _ = acc
- }
- func BenchmarkDotQ3KTile8_Scalar(b *testing.B) {
- var blocks [8]BlockQ3_K
- for bi := range blocks {
- blk := &blocks[bi]
- blk.D = 0x3C00
- for i := range blk.Scales {
- blk.Scales[i] = uint8((i+bi)%64)
- }
- for i := range blk.QS {
- blk.QS[i] = uint8(i*7 + bi)
- }
- for i := range blk.HMask {
- blk.HMask[i] = uint8(i*5 + bi)
- }
- }
- x := make([]float32, 256)
- for i := range x {
- x[i] = float32((i%19)-9) * 0.125
- }
- wp := GetQ3KDotParams(blocks[:])
- origAVX2, origAVX512 := hasAVX2, hasAVX512
- hasAVX2, hasAVX512 = false, false
- defer func() { hasAVX2, hasAVX512 = origAVX2, origAVX512 }()
- var acc float32
- b.ResetTimer()
- for i := 0; i < b.N; i++ {
- var sums [8]float32
- DotQ3KTile8(&sums, blocks[:], wp, 0, 1, &x[0], 8)
- acc += sums[0]
- }
- _ = acc
- }
- func BenchmarkDotQ8KTile8_Scalar(b *testing.B) {
- var blocks [8]BlockQ8_K
- for bi := range blocks {
- blk := &blocks[bi]
- for i := 0; i < 256; i++ {
- blk.QS[i] = int8((i+bi)%31 - 15)
- }
- blk.D = 0.125
- }
- x := make([]float32, 256)
- for i := range x {
- x[i] = float32((i%23)-11) * 0.25
- }
- origAVX2, origAVX512 := hasAVX2, hasAVX512
- hasAVX2, hasAVX512 = false, false
- defer func() { hasAVX2, hasAVX512 = origAVX2, origAVX512 }()
- var acc float32
- b.ResetTimer()
- for i := 0; i < b.N; i++ {
- var sums [8]float32
- DotQ8KTile8(&sums, blocks[:], 0, 1, &x[0], 8)
- acc += sums[0]
- }
- _ = acc
- }
- func BenchmarkDotQ6KTile8_Scalar(b *testing.B) {
- var blocks [8]BlockQ6_K
- for bi := range blocks {
- blk := &blocks[bi]
- blk.D = 0x3C00
- for i := range blk.Scales {
- blk.Scales[i] = int8(((i + bi) % 13) - 6)
- }
- for i := range blk.QL {
- blk.QL[i] = uint8(i*3 + bi)
- }
- for i := range blk.QH {
- blk.QH[i] = uint8(i*5 + bi)
- }
- }
- x := make([]float32, 256)
- for i := range x {
- x[i] = float32((i%19)-9) * 0.125
- }
- wp := GetQ6KDotParams(blocks[:])
- origAVX2, origAVX512 := hasAVX2, hasAVX512
- hasAVX2, hasAVX512 = false, false
- defer func() { hasAVX2, hasAVX512 = origAVX2, origAVX512 }()
- var acc float32
- b.ResetTimer()
- for i := 0; i < b.N; i++ {
- var sums [8]float32
- DotQ6KTile8(&sums, blocks[:], wp, 0, 1, &x[0], 8)
- acc += sums[0]
- }
- _ = acc
- }
- func BenchmarkDotQ5KTile8_Scalar(b *testing.B) {
- var blocks [8]BlockQ5_K
- for bi := range blocks {
- blk := &blocks[bi]
- blk.D = 0x3C00
- blk.DMin = 0x3800
- for i := range blk.Scales {
- blk.Scales[i] = uint8((i+bi)%11 + 1)
- }
- for i := range blk.QH {
- blk.QH[i] = uint8(i*5 + bi)
- }
- for i := range blk.QS {
- blk.QS[i] = uint8(i*7 + bi)
- }
- }
- x := make([]float32, 256)
- for i := range x {
- x[i] = float32((i%19)-9) * 0.125
- }
- wp := GetQ5KDotParams(blocks[:])
- origAVX2, origAVX512 := hasAVX2, hasAVX512
- hasAVX2, hasAVX512 = false, false
- defer func() { hasAVX2, hasAVX512 = origAVX2, origAVX512 }()
- var acc float32
- b.ResetTimer()
- for i := 0; i < b.N; i++ {
- var sums [8]float32
- DotQ5KTile8(&sums, blocks[:], wp, 0, 1, &x[0], 8)
- acc += sums[0]
- }
- _ = acc
- }
|