package tensor import "testing" func TestDequantQ8KSimdMatchesScalar(t *testing.T) { var b BlockQ8_K for i := 0; i < 256; i++ { b.QS[i] = int8(i%17 - 8) } b.D = 0.125 var outSimd, outScalar [256]float32 if !dequantQ8KSimd(&b, outSimd[:]) { t.Skip("SIMD path not available on this CPU") } // Force scalar path origAVX2, origAVX512 := hasAVX2, hasAVX512 hasAVX2, hasAVX512 = false, false DequantizeQ8_K(&b, outScalar[:]) hasAVX2, hasAVX512 = origAVX2, origAVX512 for i := range outScalar { if diff := absDiff(outSimd[i], outScalar[i]); diff > 1e-6 { t.Fatalf("mismatch at %d: simd=%f scalar=%f", i, outSimd[i], outScalar[i]) } } } func TestDequantQ4KSimdMatchesScalar(t *testing.T) { var b BlockQ4_K b.D = 0x3C00 // float16(1.0) b.DMin = 0x3800 // float16(0.5) for i := range b.Scales { b.Scales[i] = uint8((i % 4) + 1) } for i := range b.QS { b.QS[i] = uint8(i % 256) } var outSimd, outScalar [256]float32 if !dequantQ4KSimd(&b, outSimd[:]) { t.Skip("SIMD path not available on this CPU") } origAVX2, origAVX512 := hasAVX2, hasAVX512 hasAVX2, hasAVX512 = false, false DequantizeQ4_K(&b, outScalar[:]) hasAVX2, hasAVX512 = origAVX2, origAVX512 for i := range outScalar { if diff := absDiff(outSimd[i], outScalar[i]); diff > 1e-5 { t.Fatalf("mismatch at %d: simd=%f scalar=%f", i, outSimd[i], outScalar[i]) } } } func TestDotQ4KSimdMatchesScalar(t *testing.T) { var b BlockQ4_K b.D = 0x3C00 b.DMin = 0x3800 for i := range b.Scales { b.Scales[i] = uint8((i % 4) + 1) } for i := range b.QS { b.QS[i] = uint8(i % 251) } x := make([]float32, 256) for i := range x { x[i] = float32((i%23)-11) * 0.25 } // SIMD path if !hasAVX2 { t.Skip("SIMD path not available on this CPU") } simd := DotQ4_K(&b, x) // Force scalar origAVX2, origAVX512 := hasAVX2, hasAVX512 hasAVX2, hasAVX512 = false, false scalar := DotQ4_K(&b, x) hasAVX2, hasAVX512 = origAVX2, origAVX512 if diff := absDiff(simd, scalar); diff > 1e-4 { t.Fatalf("mismatch: simd=%f scalar=%f", simd, scalar) } } func TestDotQ5KParamsMatchesScalar(t *testing.T) { if !hasAVX2 && !hasAVX512 { t.Skip("SIMD path not available on this CPU") } var b BlockQ5_K b.D = 0x3C00 b.DMin = 0x3800 for i := range b.Scales { b.Scales[i] = uint8(i*7 + 3) } for i := range b.QH { b.QH[i] = uint8(i * 13) } for i := range b.QS { b.QS[i] = uint8(i * 5) } x := make([]float32, 256) for i := range x { x[i] = float32((i%23)-11) * 0.25 } params := GetQ5KDotParams([]BlockQ5_K{b}) got := DotQ5_K_Params(&b, ¶ms[0], x) origAVX2, origAVX512 := hasAVX2, hasAVX512 hasAVX2, hasAVX512 = false, false want := DotQ5_K_Params(&b, ¶ms[0], x) hasAVX2, hasAVX512 = origAVX2, origAVX512 if diff := absDiff(got, want); diff > 1e-4 { t.Fatalf("mismatch: got=%f want=%f", got, want) } } func TestDotQ5KTile8MatchesScalar(t *testing.T) { var blocks [8]BlockQ5_K for bi := range blocks { b := &blocks[bi] b.D = 0x3C00 b.DMin = 0x3800 for i := range b.Scales { b.Scales[i] = uint8((i + bi) * 3) } for i := range b.QH { b.QH[i] = uint8(i*5 + bi) } for i := range b.QS { b.QS[i] = uint8(i*7 + bi) } } x := make([]float32, 256) for i := range x { x[i] = float32((i%19)-9) * 0.125 } wp := GetQ5KDotParams(blocks[:]) var sums [8]float32 DotQ5KTile8(&sums, blocks[:], wp, 0, 1, &x[0], 8) for i := 0; i < 8; i++ { want := DotQ5_K_Params(&blocks[i], &wp[i], x) if diff := absDiff(sums[i], want); diff > 1e-4 { t.Fatalf("mismatch at %d: got=%f want=%f", i, sums[i], want) } } } func TestDotQ6KParamsMatchesScalar(t *testing.T) { var b BlockQ6_K b.D = 0x3C00 for i := range b.Scales { b.Scales[i] = int8((i%11) - 5) } for i := range b.QL { b.QL[i] = uint8(i * 7) } for i := range b.QH { b.QH[i] = uint8(i * 13) } x := make([]float32, 256) for i := range x { x[i] = float32((i%23)-11) * 0.25 } params := GetQ6KDotParams([]BlockQ6_K{b}) got := DotQ6_K_Params(&b, ¶ms[0], x) origAVX2, origAVX512 := hasAVX2, hasAVX512 hasAVX2, hasAVX512 = false, false want := DotQ6_K(&b, x) hasAVX2, hasAVX512 = origAVX2, origAVX512 if diff := absDiff(got, want); diff > 1e-4 { t.Fatalf("mismatch: got=%f want=%f", got, want) } } func TestDotQ6KTile8MatchesScalar(t *testing.T) { var blocks [8]BlockQ6_K for bi := range blocks { b := &blocks[bi] b.D = 0x3C00 for i := range b.Scales { b.Scales[i] = int8(((i + bi) % 13) - 6) } for i := range b.QL { b.QL[i] = uint8(i*3 + bi) } for i := range b.QH { b.QH[i] = uint8(i*5 + bi) } } x := make([]float32, 256) for i := range x { x[i] = float32((i%19)-9) * 0.125 } wp := GetQ6KDotParams(blocks[:]) var sums [8]float32 DotQ6KTile8(&sums, blocks[:], wp, 0, 1, &x[0], 8) for i := 0; i < 8; i++ { want := DotQ6_K_Params(&blocks[i], &wp[i], x) if diff := absDiff(sums[i], want); diff > 1e-4 { t.Fatalf("mismatch at %d: got=%f want=%f", i, sums[i], want) } } } func TestDotQ8KSimdMatchesScalar(t *testing.T) { var b BlockQ8_K for i := 0; i < 256; i++ { b.QS[i] = int8((i%31)-15) } b.D = 0.125 x := make([]float32, 256) for i := range x { x[i] = float32((i%23)-11) * 0.25 } if !hasAVX2 && !hasAVX512 { t.Skip("SIMD path not available on this CPU") } simd := DotQ8_K(&b, x) origAVX2, origAVX512 := hasAVX2, hasAVX512 hasAVX2, hasAVX512 = false, false scalar := DotQ8_K(&b, x) hasAVX2, hasAVX512 = origAVX2, origAVX512 if diff := absDiff(simd, scalar); diff > 1e-3 { t.Fatalf("mismatch: simd=%f scalar=%f", simd, scalar) } } func TestDotQ2KMatchesDequantDot(t *testing.T) { var b BlockQ2_K b.D = 0x3C00 b.DMin = 0x3800 for i := range b.Scales { b.Scales[i] = uint8((i%16)*16 + i%16) } for i := range b.QS { b.QS[i] = uint8(i * 3) } x := make([]float32, 256) for i := range x { x[i] = float32((i%19)-9) * 0.125 } got := DotQ2_K(&b, x) var deq [256]float32 DequantizeQ2_K(&b, deq[:]) var want float32 for i := 0; i < 256; i++ { want += x[i] * deq[i] } if diff := absDiff(got, want); diff > 1e-4 { t.Fatalf("mismatch: got=%f want=%f", got, want) } } func TestDotQ3KMatchesDequantDot(t *testing.T) { var b BlockQ3_K b.D = 0x3C00 for i := range b.Scales { b.Scales[i] = uint8(i * 7) } for i := range b.QS { b.QS[i] = uint8(i) } for i := range b.HMask { b.HMask[i] = uint8(i * 5) } x := make([]float32, 256) for i := range x { x[i] = float32((i%17)-8) * 0.25 } got := DotQ3_K(&b, x) var deq [256]float32 DequantizeQ3_K(&b, deq[:]) var want float32 for i := 0; i < 256; i++ { want += x[i] * deq[i] } if diff := absDiff(got, want); diff > 1e-4 { t.Fatalf("mismatch: got=%f want=%f", got, want) } } func TestDotQ2KSimdMatchesScalar(t *testing.T) { if !hasAVX2 { t.Skip("SIMD path not available on this CPU") } var b BlockQ2_K b.D = 0x3C00 b.DMin = 0x3800 for i := range b.Scales { b.Scales[i] = uint8((i%16)*16 + i%16) } for i := range b.QS { b.QS[i] = uint8(i * 7) } x := make([]float32, 256) for i := range x { x[i] = float32((i%29)-14) * 0.125 } simd := DotQ2_K(&b, x) origAVX2, origAVX512 := hasAVX2, hasAVX512 hasAVX2, hasAVX512 = false, false scalar := DotQ2_K(&b, x) hasAVX2, hasAVX512 = origAVX2, origAVX512 if diff := absDiff(simd, scalar); diff > 1e-4 { t.Fatalf("mismatch: simd=%f scalar=%f", simd, scalar) } } func TestDotQ3KSimdMatchesScalar(t *testing.T) { if !hasAVX2 { t.Skip("SIMD path not available on this CPU") } var b BlockQ3_K b.D = 0x3C00 for i := range b.Scales { b.Scales[i] = uint8(i*11 + 3) } for i := range b.QS { b.QS[i] = uint8(i * 5) } for i := range b.HMask { b.HMask[i] = uint8(i * 13) } x := make([]float32, 256) for i := range x { x[i] = float32((i%23)-11) * 0.25 } simd := DotQ3_K(&b, x) origAVX2, origAVX512 := hasAVX2, hasAVX512 hasAVX2, hasAVX512 = false, false scalar := DotQ3_K(&b, x) hasAVX2, hasAVX512 = origAVX2, origAVX512 if diff := absDiff(simd, scalar); diff > 1e-4 { t.Fatalf("mismatch: simd=%f scalar=%f", simd, scalar) } } func TestDotQ2KParamsMatchesScalar(t *testing.T) { var b BlockQ2_K b.D = 0x3C00 b.DMin = 0x3800 for i := range b.Scales { b.Scales[i] = uint8(i*7 + 3) } for i := range b.QS { b.QS[i] = uint8(i * 5) } x := make([]float32, 256) for i := range x { x[i] = float32((i%23)-11) * 0.25 } params := GetQ2KDotParams([]BlockQ2_K{b}) got := DotQ2_K_Params(&b, ¶ms[0], x) origAVX2, origAVX512 := hasAVX2, hasAVX512 hasAVX2, hasAVX512 = false, false want := DotQ2_K(&b, x) hasAVX2, hasAVX512 = origAVX2, origAVX512 if diff := absDiff(got, want); diff > 1e-4 { t.Fatalf("mismatch: got=%f want=%f", got, want) } } func TestDotQ2KTile8MatchesScalar(t *testing.T) { var blocks [8]BlockQ2_K for bi := range blocks { b := &blocks[bi] b.D = 0x3C00 b.DMin = 0x3800 for i := range b.Scales { b.Scales[i] = uint8((i + bi) * 3) } for i := range b.QS { b.QS[i] = uint8(i*7 + bi) } } x := make([]float32, 256) for i := range x { x[i] = float32((i%19)-9) * 0.125 } wp := GetQ2KDotParams(blocks[:]) var sums [8]float32 DotQ2KTile8(&sums, blocks[:], wp, 0, 1, &x[0], 8) for i := 0; i < 8; i++ { want := DotQ2_K_Params(&blocks[i], &wp[i], x) if diff := absDiff(sums[i], want); diff > 1e-4 { t.Fatalf("mismatch at %d: got=%f want=%f", i, sums[i], want) } } } func TestDotQ8KTile8MatchesScalar(t *testing.T) { var blocks [8]BlockQ8_K for bi := range blocks { b := &blocks[bi] for i := 0; i < 256; i++ { b.QS[i] = int8((i+bi)%31 - 15) } b.D = 0.125 } x := make([]float32, 256) for i := range x { x[i] = float32((i%23)-11) * 0.25 } var sums [8]float32 DotQ8KTile8(&sums, blocks[:], 0, 1, &x[0], 8) for i := 0; i < 8; i++ { want := DotQ8_K(&blocks[i], x) if diff := absDiff(sums[i], want); diff > 1e-3 { t.Fatalf("mismatch at %d: got=%f want=%f", i, sums[i], want) } } } func TestDotQ6KSimdMatchesScalar(t *testing.T) { if !hasAVX2 { t.Skip("SIMD path not available on this CPU") } if !q6kSimdReady() { t.Skip("Q6K SIMD gate disabled") } var b BlockQ6_K b.D = 0x3C00 for i := range b.Scales { b.Scales[i] = int8((i % 16) - 8) } for i := range b.QL { b.QL[i] = uint8(i) } for i := range b.QH { b.QH[i] = uint8(i * 3) } x := make([]float32, 256) for i := range x { x[i] = float32((i%23)-11) * 0.25 } simd := DotQ6_K(&b, x) origAVX2, origAVX512 := hasAVX2, hasAVX512 hasAVX2, hasAVX512 = false, false scalar := DotQ6_K(&b, x) hasAVX2, hasAVX512 = origAVX2, origAVX512 if diff := absDiff(simd, scalar); diff > 1e-3 { t.Fatalf("mismatch: simd=%f scalar=%f", simd, scalar) } } func TestDequantQ2KSimdMatchesScalar(t *testing.T) { var b BlockQ2_K b.D = 0x3C00 // float16(1.0) b.DMin = 0x3800 // float16(0.5) for i := range b.Scales { b.Scales[i] = uint8((i%16)*16 + i%16) // both scales and mins } for i := range b.QS { b.QS[i] = uint8(i) } var outSimd, outScalar [256]float32 if !dequantQ2KSimd(&b, outSimd[:]) { t.Skip("SIMD path not available on this CPU") } origAVX2, origAVX512 := hasAVX2, hasAVX512 hasAVX2, hasAVX512 = false, false DequantizeQ2_K(&b, outScalar[:]) hasAVX2, hasAVX512 = origAVX2, origAVX512 for i := range outScalar { if diff := absDiff(outSimd[i], outScalar[i]); diff > 1e-5 { t.Fatalf("mismatch at %d: simd=%f scalar=%f", i, outSimd[i], outScalar[i]) } } } func TestDequantQ3KSimdMatchesScalar(t *testing.T) { var b BlockQ3_K b.D = 0x3C00 // float16(1.0) for i := range b.Scales { b.Scales[i] = uint8(i % 64) } for i := range b.QS { b.QS[i] = uint8(i) } for i := range b.HMask { b.HMask[i] = uint8(i * 7) } var outSimd, outScalar [256]float32 if !dequantQ3KSimd(&b, outSimd[:]) { t.Skip("SIMD path not available on this CPU") } origAVX2, origAVX512 := hasAVX2, hasAVX512 hasAVX2, hasAVX512 = false, false DequantizeQ3_K(&b, outScalar[:]) hasAVX2, hasAVX512 = origAVX2, origAVX512 for i := range outScalar { if diff := absDiff(outSimd[i], outScalar[i]); diff > 1e-5 { t.Fatalf("mismatch at %d: simd=%f scalar=%f", i, outSimd[i], outScalar[i]) } } } func TestDequantQ6KSimdMatchesScalar(t *testing.T) { var b BlockQ6_K b.D = 0x3C00 // float16(1.0) for i := range b.Scales { b.Scales[i] = int8((i % 16) - 8) } for i := range b.QL { b.QL[i] = uint8(i) } for i := range b.QH { b.QH[i] = uint8(i * 3) } var outSimd, outScalar [256]float32 if !dequantQ6KSimd(&b, outSimd[:]) { t.Skip("SIMD path not available on this CPU") } origAVX2, origAVX512 := hasAVX2, hasAVX512 hasAVX2, hasAVX512 = false, false DequantizeQ6_K(&b, outScalar[:]) hasAVX2, hasAVX512 = origAVX2, origAVX512 for i := range outScalar { if diff := absDiff(outSimd[i], outScalar[i]); diff > 1e-4 { t.Fatalf("mismatch at %d: simd=%f scalar=%f", i, outSimd[i], outScalar[i]) } } } func absDiff(a, b float32) float32 { if a > b { return a - b } return b - a }