//go:build cuda

package tests

import (
	"math"
	"math/rand"
	"testing"
	"unsafe"

	"makarna/pkg/backend/cpu"
	cpunn "makarna/pkg/backend/cpu/nn"
	"makarna/pkg/backend/cuda"
	"makarna/pkg/quant"
	"makarna/pkg/tensor"
)

// almostEq reports whether a and b agree within atol + rtol*|b|.
func almostEq(a, b, atol, rtol float32) bool {
	d := a - b
	if d < 0 {
		d = -d
	}
	thr := atol + rtol*float32(math.Abs(float64(b)))
	return d <= thr
}

// assertAllClose fails the test at the first element of got that is not within
// tolerance of the corresponding element of want.
func assertAllClose(t *testing.T, name string, got, want []float32, atol, rtol float32) {
	if len(got) != len(want) {
		t.Fatalf("%s len mismatch got=%d want=%d", name, len(got), len(want))
	}
	for i := range got {
		if !almostEq(got[i], want[i], atol, rtol) {
			t.Fatalf("%s mismatch at %d: got=%f want=%f (atol=%g rtol=%g)", name, i, got[i], want[i], atol, rtol)
		}
	}
}

func TestHarness_CUDA_DequantMatchesCPU(t *testing.T) {
	gpu := 0
	if !cuda.Available() {
		t.Skip("cuda not available")
	}
	cases := []struct {
		name  string
		seed  int64
		scale float32
	}{
		{name: "small", seed: 10, scale: 0.01},
		{name: "medium", seed: 11, scale: 1.0},
		{name: "large", seed: 12, scale: 50.0},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			r := rand.New(rand.NewSource(tc.seed))
			inp := make([]float32, 256)
			for i := range inp {
				inp[i] = (r.Float32()*2 - 1) * tc.scale
			}
			q2 := quant.QuantizeQ2K(inp)
			q3 := quant.QuantizeQ3K(inp)
			q4 := quant.QuantizeQ4K(inp)
			q6 := quant.QuantizeQ6K(inp)
			q8 := quant.QuantizeQ8K(inp)
			ref2 := make([]float32, 256)
			ref3 := make([]float32, 256)
			ref4 := make([]float32, 256)
			ref6 := make([]float32, 256)
			ref8 := make([]float32, 256)
			tensor.DequantizeQ2_K((*tensor.BlockQ2_K)(unsafe.Pointer(&q2[0])), ref2)
			tensor.DequantizeQ3_K((*tensor.BlockQ3_K)(unsafe.Pointer(&q3[0])), ref3)
			tensor.DequantizeQ4_K((*tensor.BlockQ4_K)(unsafe.Pointer(&q4[0])), ref4)
			tensor.DequantizeQ6_K((*tensor.BlockQ6_K)(unsafe.Pointer(&q6[0])), ref6)
			tensor.DequantizeQ8_K((*tensor.BlockQ8_K)(unsafe.Pointer(&q8[0])), ref8)

			// Allocate output
			out, err := cuda.NewTensor(tensor.Shape{256}, tensor.Float32, gpu)
			if err != nil {
				t.Fatalf("new out: %v", err)
			}
			defer out.Free()

			// Q8
			devQ8, err := cuda.UploadQ8K(q8, 1, gpu)
			if err != nil {
				t.Fatalf("upload q8: %v", err)
			}
			defer cuda.FreeDevicePtr(devQ8)
			if err := cuda.DequantQ8K(devQ8, out.Data().(unsafe.Pointer), 1, gpu); err != nil {
				t.Fatalf("dequant q8: %v", err)
			}
			h8 := make([]float32, 256)
			if err := out.CopyToHost(h8); err != nil {
				t.Fatalf("copy q8: %v", err)
			}
			assertAllClose(t, "q8k", h8, ref8, 1e-3, 1e-3)

			// Q4
			devQ4, err := cuda.UploadQ4K(q4, 1, gpu)
			if err != nil {
				t.Fatalf("upload q4: %v", err)
			}
			defer cuda.FreeDevicePtr(devQ4)
			if err := cuda.DequantQ4K(devQ4, out.Data().(unsafe.Pointer), 1, gpu); err != nil {
				t.Fatalf("dequant q4: %v", err)
			}
			h4 := make([]float32, 256)
			if err := out.CopyToHost(h4); err != nil {
				t.Fatalf("copy q4: %v", err)
			}
			assertAllClose(t, "q4k", h4, ref4, 1e-2, 1e-2)

			// Q6
			devQ6, err := cuda.UploadQ6K(q6, 1, gpu)
			if err != nil {
				t.Fatalf("upload q6: %v", err)
			}
			defer cuda.FreeDevicePtr(devQ6)
			if err := cuda.DequantQ6K(devQ6, out.Data().(unsafe.Pointer), 1, gpu); err != nil {
				t.Fatalf("dequant q6: %v", err)
			}
			h6 := make([]float32, 256)
			if err := out.CopyToHost(h6); err != nil {
				t.Fatalf("copy q6: %v", err)
			}
			assertAllClose(t, "q6k", h6, ref6, 1e-2, 1e-2)

			// Q3
			devQ3, err := cuda.UploadQ3K(q3, 1, gpu)
			if err != nil {
				t.Fatalf("upload q3: %v", err)
			}
			defer cuda.FreeDevicePtr(devQ3)
			if err := cuda.DequantQ3K(devQ3, out.Data().(unsafe.Pointer), 1, gpu); err != nil {
				t.Fatalf("dequant q3: %v", err)
			}
			h3 := make([]float32, 256)
			if err := out.CopyToHost(h3); err != nil {
				t.Fatalf("copy q3: %v", err)
			}
assertAllClose(t, "q3k", h3, ref3, 2e-2, 2e-2) // Q2 devQ2, err := cuda.UploadQ2K(q2, 1, gpu) if err != nil { t.Fatalf("upload q2: %v", err) } defer cuda.FreeDevicePtr(devQ2) if err := cuda.DequantQ2K(devQ2, out.Data().(unsafe.Pointer), 1, gpu); err != nil { t.Fatalf("dequant q2: %v", err) } h2 := make([]float32, 256) if err := out.CopyToHost(h2); err != nil { t.Fatalf("copy q2: %v", err) } assertAllClose(t, "q2k", h2, ref2, 5e-2, 5e-2) }) } } func TestHarness_CUDA_FusedMatMulMatchesCPUReference(t *testing.T) { gpu := 0 if !cuda.Available() { t.Skip("cuda not available") } // Keep small M,N but K must be 256-multiple for K-quants M, K, N := 3, 256, 4 r := rand.New(rand.NewSource(999)) // CPU inputs Ahost := make([]float32, M*K) Bhost := make([]float32, N*K) for i := range Ahost { Ahost[i] = r.Float32()*2 - 1 } for i := range Bhost { Bhost[i] = r.Float32()*2 - 1 } // CPU reference: C = A @ B^T ref := make([]float32, M*N) for m := 0; m < M; m++ { for n := 0; n < N; n++ { var s float32 for k := 0; k < K; k++ { s += Ahost[m*K+k] * Bhost[n*K+k] } ref[m*N+n] = s } } // Upload A to GPU Adev, err := cuda.NewTensor(tensor.Shape{M, K}, tensor.Float32, gpu) if err != nil { t.Fatalf("Adev: %v", err) } defer Adev.Free() if err := Adev.CopyFrom(Ahost); err != nil { t.Fatalf("copy A: %v", err) } Cdev, err := cuda.NewTensor(tensor.Shape{M, N}, tensor.Float32, gpu) if err != nil { t.Fatalf("Cdev: %v", err) } defer Cdev.Free() t.Run("q8k", func(t *testing.T) { q := quant.QuantizeQ8K(Bhost) devB, err := cuda.UploadQ8K(q, N*(K/256), gpu) if err != nil { t.Fatalf("upload: %v", err) } defer cuda.FreeDevicePtr(devB) if err := cuda.MatMulQ8K(Adev.Data().(unsafe.Pointer), devB, Cdev.Data().(unsafe.Pointer), M, K, N, gpu); err != nil { t.Fatalf("matmul: %v", err) } h := make([]float32, M*N) _ = cuda.Synchronize(gpu) if err := Cdev.CopyToHost(h); err != nil { t.Fatalf("copy: %v", err) } assertAllClose(t, "matmul q8k", h, ref, 5e-1, 5e-2) }) t.Run("q4k", func(t *testing.T) { q := quant.QuantizeQ4K(Bhost) devB, err := cuda.UploadQ4K(q, N*(K/256), gpu) if err != nil { t.Fatalf("upload: %v", err) } defer cuda.FreeDevicePtr(devB) if err := cuda.MatMulQ4K(Adev.Data().(unsafe.Pointer), devB, Cdev.Data().(unsafe.Pointer), M, K, N, gpu); err != nil { t.Fatalf("matmul: %v", err) } h := make([]float32, M*N) _ = cuda.Synchronize(gpu) if err := Cdev.CopyToHost(h); err != nil { t.Fatalf("copy: %v", err) } assertAllClose(t, "matmul q4k", h, ref, 2.0, 1e-1) }) // For Q2/Q3/Q6, API signatures in cuda.go do not take gpu param. // Keep them as separate subtests but use gpu=0 tensors. 
t.Run("q2k", func(t *testing.T) { q := quant.QuantizeQ2K(Bhost) devB, err := cuda.UploadQ2K(q, N*(K/256), gpu) if err != nil { t.Fatalf("upload: %v", err) } defer cuda.FreeDevicePtr(devB) if err := cuda.MatMulQ2K(Adev.Data().(unsafe.Pointer), devB, Cdev.Data().(unsafe.Pointer), M, K, N, gpu); err != nil { t.Fatalf("matmul: %v", err) } h := make([]float32, M*N) _ = cuda.Synchronize(gpu) if err := Cdev.CopyToHost(h); err != nil { t.Fatalf("copy: %v", err) } assertAllClose(t, "matmul q2k", h, ref, 3.0, 2e-1) }) t.Run("q3k", func(t *testing.T) { q := quant.QuantizeQ3K(Bhost) devB, err := cuda.UploadQ3K(q, N*(K/256), gpu) if err != nil { t.Fatalf("upload: %v", err) } defer cuda.FreeDevicePtr(devB) if err := cuda.MatMulQ3K(Adev.Data().(unsafe.Pointer), devB, Cdev.Data().(unsafe.Pointer), M, K, N, gpu); err != nil { t.Fatalf("matmul: %v", err) } h := make([]float32, M*N) _ = cuda.Synchronize(gpu) if err := Cdev.CopyToHost(h); err != nil { t.Fatalf("copy: %v", err) } assertAllClose(t, "matmul q3k", h, ref, 2.5, 2e-1) }) t.Run("q6k", func(t *testing.T) { q := quant.QuantizeQ6K(Bhost) devB, err := cuda.UploadQ6K(q, N*(K/256), gpu) if err != nil { t.Fatalf("upload: %v", err) } defer cuda.FreeDevicePtr(devB) if err := cuda.MatMulQ6K(Adev.Data().(unsafe.Pointer), devB, Cdev.Data().(unsafe.Pointer), M, K, N, gpu); err != nil { t.Fatalf("matmul: %v", err) } h := make([]float32, M*N) _ = cuda.Synchronize(gpu) if err := Cdev.CopyToHost(h); err != nil { t.Fatalf("copy: %v", err) } assertAllClose(t, "matmul q6k", h, ref, 1.0, 1e-1) }) } func TestHarness_CUDA_NNOpsMatchCPU(t *testing.T) { gpu := 0 if !cuda.Available() { t.Skip("cuda not available") } seqLen := 4 headDim := 8 numHeads := 2 numKVHeads := 1 totalDim := numHeads * headDim r := rand.New(rand.NewSource(2025)) Q := make([]float32, seqLen*totalDim) K := make([]float32, seqLen*(numKVHeads*headDim)) V := make([]float32, seqLen*(numKVHeads*headDim)) W := make([]float32, totalDim) for i := range Q { Q[i] = r.Float32()*2 - 1 } for i := range K { K[i] = r.Float32()*2 - 1 } for i := range V { V[i] = r.Float32()*2 - 1 } for i := range W { W[i] = r.Float32()*2 - 1 } // CPU reference qCPU := cpu.NewTensor(tensor.Shape{seqLen, totalDim}, append([]float32(nil), Q...)) kCPU := cpu.NewTensor(tensor.Shape{seqLen, numKVHeads * headDim}, append([]float32(nil), K...)) vCPU := cpu.NewTensor(tensor.Shape{seqLen, numKVHeads * headDim}, append([]float32(nil), V...)) outCPU := cpu.NewTensor(tensor.Shape{seqLen, totalDim}, nil) if err := cpunn.CausalAttention(qCPU, kCPU, vCPU, outCPU, numHeads, numKVHeads, headDim); err != nil { t.Fatalf("cpu attention: %v", err) } // CUDA tensors qGPU, _ := cuda.NewTensor(tensor.Shape{seqLen, totalDim}, tensor.Float32, gpu) kGPU, _ := cuda.NewTensor(tensor.Shape{seqLen, numKVHeads * headDim}, tensor.Float32, gpu) vGPU, _ := cuda.NewTensor(tensor.Shape{seqLen, numKVHeads * headDim}, tensor.Float32, gpu) outGPU, _ := cuda.NewTensor(tensor.Shape{seqLen, totalDim}, tensor.Float32, gpu) wGPU, _ := cuda.NewTensor(tensor.Shape{totalDim}, tensor.Float32, gpu) qAttGPU, _ := cuda.NewTensor(tensor.Shape{seqLen, totalDim}, tensor.Float32, gpu) defer qGPU.Free() defer kGPU.Free() defer vGPU.Free() defer outGPU.Free() defer wGPU.Free() defer qAttGPU.Free() _ = qGPU.CopyFrom(Q) _ = kGPU.CopyFrom(K) _ = vGPU.CopyFrom(V) _ = wGPU.CopyFrom(W) _ = qAttGPU.CopyFrom(Q) // RMSNorm CPU vs CUDA // Apply RMSNorm on a copy of Q qCPU2 := cpu.NewTensor(tensor.Shape{seqLen, totalDim}, append([]float32(nil), Q...)) wCPU2 := cpu.NewTensor(tensor.Shape{totalDim}, 
	if err := cpunn.RMSNorm(qCPU2, wCPU2, 1e-5); err != nil {
		t.Fatalf("cpu rmsnorm: %v", err)
	}
	if err := cuda.RMSNorm(qGPU.Data().(unsafe.Pointer), wGPU.Data().(unsafe.Pointer), seqLen, totalDim, 1e-5, gpu); err != nil {
		t.Fatalf("cuda rmsnorm: %v", err)
	}
	qR := make([]float32, seqLen*totalDim)
	_ = qGPU.CopyToHost(qR)
	assertAllClose(t, "rmsnorm", qR, qCPU2.DataFloat32(), 5e-3, 5e-3)

	// RoPE CPU vs CUDA
	pos := make([]int32, seqLen)
	posCPU := make([]int, seqLen)
	for i := range pos {
		pos[i] = int32(i)
		posCPU[i] = i
	}
	posDev, err := cuda.AllocAndCopyInt32(pos, gpu)
	if err != nil {
		t.Fatalf("alloc pos: %v", err)
	}
	defer cuda.FreeDevicePtr(posDev)
	qCPU3 := cpu.NewTensor(tensor.Shape{seqLen, totalDim}, append([]float32(nil), Q...))
	if err := cpunn.RoPE(qCPU3, posCPU, headDim, 10000); err != nil {
		t.Fatalf("cpu rope: %v", err)
	}
	qGPU2, _ := cuda.NewTensor(tensor.Shape{seqLen, totalDim}, tensor.Float32, gpu)
	defer qGPU2.Free()
	_ = qGPU2.CopyFrom(Q)
	if err := cuda.RoPE(qGPU2.Data().(unsafe.Pointer), posDev, seqLen, numHeads, headDim, 10000, gpu); err != nil {
		t.Fatalf("cuda rope: %v", err)
	}
	qr := make([]float32, seqLen*totalDim)
	_ = qGPU2.CopyToHost(qr)
	assertAllClose(t, "rope", qr, qCPU3.DataFloat32(), 2e-2, 2e-2)

	// Softmax CPU vs CUDA on one row
	rowCPU := cpu.NewTensor(tensor.Shape{totalDim}, append([]float32(nil), Q[:totalDim]...))
	if err := cpunn.Softmax(rowCPU); err != nil {
		t.Fatalf("cpu softmax: %v", err)
	}
	rowGPU, _ := cuda.NewTensor(tensor.Shape{1, totalDim}, tensor.Float32, gpu)
	defer rowGPU.Free()
	_ = rowGPU.CopyFrom(Q[:totalDim])
	if err := cuda.Softmax(rowGPU.Data().(unsafe.Pointer), 1, totalDim, gpu); err != nil {
		t.Fatalf("cuda softmax: %v", err)
	}
	rowOut := make([]float32, totalDim)
	_ = rowGPU.CopyToHost(rowOut)
	assertAllClose(t, "softmax", rowOut, rowCPU.DataFloat32(), 2e-3, 2e-3)

	// Attention CPU vs CUDA
	scale := float32(1.0 / math.Sqrt(float64(headDim)))
	if err := cuda.Attention(qAttGPU.Data().(unsafe.Pointer), kGPU.Data().(unsafe.Pointer), vGPU.Data().(unsafe.Pointer), outGPU.Data().(unsafe.Pointer), seqLen, seqLen, numHeads, numKVHeads, headDim, scale, 0, gpu); err != nil {
		t.Fatalf("cuda attention: %v", err)
	}
	outH := make([]float32, seqLen*totalDim)
	_ = outGPU.CopyToHost(outH)
	assertAllClose(t, "attention", outH, outCPU.DataFloat32(), 5e-2, 5e-2)
}

func TestHarness_CUDA_PagedAttentionBatchMatchesSingle(t *testing.T) {
	gpu := 0
	if !cuda.Available() {
		t.Skip("cuda not available")
	}
	blockSize := 4
	kvLen0 := 5
	kvLen1 := 6
	headDim := 8
	numHeads := 2
	numKVHeads := 1
	kvStride := numKVHeads * headDim
	scale := float32(1.0 / math.Sqrt(float64(headDim)))

	// Decode-style: one token per sequence.
	numTokens := 2
	qGPU, err := cuda.NewTensor(tensor.Shape{numTokens, numHeads * headDim}, tensor.Float32, gpu)
	if err != nil {
		t.Fatalf("new q: %v", err)
	}
	defer qGPU.Free()
	outBatchGPU, err := cuda.NewTensor(tensor.Shape{numTokens, numHeads * headDim}, tensor.Float32, gpu)
	if err != nil {
		t.Fatalf("new out batch: %v", err)
	}
	defer outBatchGPU.Free()

	// Per-sequence outputs.
	out0GPU, _ := cuda.NewTensor(tensor.Shape{1, numHeads * headDim}, tensor.Float32, gpu)
	out1GPU, _ := cuda.NewTensor(tensor.Shape{1, numHeads * headDim}, tensor.Float32, gpu)
	defer out0GPU.Free()
	defer out1GPU.Free()

	r := rand.New(rand.NewSource(777))
	qHost := make([]float32, numTokens*numHeads*headDim)
	for i := range qHost {
		qHost[i] = r.Float32()*2 - 1
	}
	if err := qGPU.CopyFrom(qHost); err != nil {
		t.Fatalf("copy q: %v", err)
	}

	// Build paged K/V blocks for each sequence.
	makeSeqBlocks := func(kvLen int) ([]*cuda.Tensor, []*cuda.Tensor, []uintptr, []uintptr) {
		nBlocks := (kvLen + blockSize - 1) / blockSize
		kBlocks := make([]*cuda.Tensor, nBlocks)
		vBlocks := make([]*cuda.Tensor, nBlocks)
		kPtrs := make([]uintptr, nBlocks)
		vPtrs := make([]uintptr, nBlocks)
		for b := 0; b < nBlocks; b++ {
			kT, err := cuda.NewTensor(tensor.Shape{blockSize, kvStride}, tensor.Float32, gpu)
			if err != nil {
				t.Fatalf("new k block: %v", err)
			}
			vT, err := cuda.NewTensor(tensor.Shape{blockSize, kvStride}, tensor.Float32, gpu)
			if err != nil {
				t.Fatalf("new v block: %v", err)
			}
			kBlocks[b] = kT
			vBlocks[b] = vT
			kPtrs[b] = uintptr(kT.Data().(unsafe.Pointer))
			vPtrs[b] = uintptr(vT.Data().(unsafe.Pointer))
			kHost := make([]float32, blockSize*kvStride)
			vHost := make([]float32, blockSize*kvStride)
			for i := range kHost {
				kHost[i] = r.Float32()*2 - 1
				vHost[i] = r.Float32()*2 - 1
			}
			_ = kT.CopyFrom(kHost)
			_ = vT.CopyFrom(vHost)
		}
		return kBlocks, vBlocks, kPtrs, vPtrs
	}
	kBlocks0, vBlocks0, kPtrs0, vPtrs0 := makeSeqBlocks(kvLen0)
	kBlocks1, vBlocks1, kPtrs1, vPtrs1 := makeSeqBlocks(kvLen1)
	defer func() {
		for i := range kBlocks0 {
			kBlocks0[i].Free()
			vBlocks0[i].Free()
		}
		for i := range kBlocks1 {
			kBlocks1[i].Free()
			vBlocks1[i].Free()
		}
	}()

	// Reference: run single-seq paged attention for each token.
	kDev0, err := cuda.AllocAndCopyPtrTable(kPtrs0, gpu)
	if err != nil {
		t.Fatalf("alloc k ptrs0: %v", err)
	}
	defer cuda.FreeDevicePtr(kDev0)
	vDev0, err := cuda.AllocAndCopyPtrTable(vPtrs0, gpu)
	if err != nil {
		t.Fatalf("alloc v ptrs0: %v", err)
	}
	defer cuda.FreeDevicePtr(vDev0)
	kDev1, err := cuda.AllocAndCopyPtrTable(kPtrs1, gpu)
	if err != nil {
		t.Fatalf("alloc k ptrs1: %v", err)
	}
	defer cuda.FreeDevicePtr(kDev1)
	vDev1, err := cuda.AllocAndCopyPtrTable(vPtrs1, gpu)
	if err != nil {
		t.Fatalf("alloc v ptrs1: %v", err)
	}
	defer cuda.FreeDevicePtr(vDev1)

	q0View, _ := qGPU.ViewAt(tensor.Shape{1, numHeads * headDim}, 0)
	q1View, _ := qGPU.ViewAt(tensor.Shape{1, numHeads * headDim}, uintptr(numHeads*headDim*4))
	if err := cuda.PagedAttention(
		q0View.Data().(unsafe.Pointer), kDev0, vDev0, out0GPU.Data().(unsafe.Pointer),
		1, kvLen0, numHeads, numKVHeads, headDim, blockSize, scale, kvLen0-1, gpu,
	); err != nil {
		t.Fatalf("paged attention 0: %v", err)
	}
	if err := cuda.PagedAttention(
		q1View.Data().(unsafe.Pointer), kDev1, vDev1, out1GPU.Data().(unsafe.Pointer),
		1, kvLen1, numHeads, numKVHeads, headDim, blockSize, scale, kvLen1-1, gpu,
	); err != nil {
		t.Fatalf("paged attention 1: %v", err)
	}

	// Batched: flatten block pointer tables.
	flatKPtrs := append(append([]uintptr(nil), kPtrs0...), kPtrs1...)
	flatVPtrs := append(append([]uintptr(nil), vPtrs0...), vPtrs1...)
	kFlatDev, err := cuda.AllocAndCopyPtrTable(flatKPtrs, gpu)
	if err != nil {
		t.Fatalf("alloc flat k: %v", err)
	}
	defer cuda.FreeDevicePtr(kFlatDev)
	vFlatDev, err := cuda.AllocAndCopyPtrTable(flatVPtrs, gpu)
	if err != nil {
		t.Fatalf("alloc flat v: %v", err)
	}
	defer cuda.FreeDevicePtr(vFlatDev)

	blockOffsets := []int32{0, int32(len(kPtrs0))}
	kvLens := []int32{int32(kvLen0), int32(kvLen1)}
	queryPos := []int32{int32(kvLen0 - 1), int32(kvLen1 - 1)}
	maxKvLen := kvLen0
	if kvLen1 > maxKvLen {
		maxKvLen = kvLen1
	}
	offDev, err := cuda.AllocAndCopyInt32(blockOffsets, gpu)
	if err != nil {
		t.Fatalf("alloc offsets: %v", err)
	}
	defer cuda.FreeDevicePtr(offDev)
	kvDev, err := cuda.AllocAndCopyInt32(kvLens, gpu)
	if err != nil {
		t.Fatalf("alloc kv lens: %v", err)
	}
	defer cuda.FreeDevicePtr(kvDev)
	qposDev, err := cuda.AllocAndCopyInt32(queryPos, gpu)
	if err != nil {
		t.Fatalf("alloc qpos: %v", err)
	}
	defer cuda.FreeDevicePtr(qposDev)

	if err := cuda.PagedAttentionBatch(
		qGPU.Data().(unsafe.Pointer), kFlatDev, vFlatDev, offDev, kvDev, qposDev,
		outBatchGPU.Data().(unsafe.Pointer), numTokens, numHeads, numKVHeads, headDim,
		blockSize, scale, maxKvLen, gpu,
	); err != nil {
		t.Fatalf("paged attention batch: %v", err)
	}
	outBatchHost := make([]float32, numTokens*numHeads*headDim)
	if err := outBatchGPU.CopyToHost(outBatchHost); err != nil {
		t.Fatalf("copy out batch: %v", err)
	}
	out0Host := make([]float32, numHeads*headDim)
	out1Host := make([]float32, numHeads*headDim)
	_ = out0GPU.CopyToHost(out0Host)
	_ = out1GPU.CopyToHost(out1Host)
	assertAllClose(t, "paged_attention_batch_tok0", outBatchHost[:numHeads*headDim], out0Host, 2e-2, 2e-2)
	assertAllClose(t, "paged_attention_batch_tok1", outBatchHost[numHeads*headDim:], out1Host, 2e-2, 2e-2)
}
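
// Usage note: the //go:build cuda constraint at the top of this file keeps the
// harness out of default builds, so it only compiles and runs when the tag is
// supplied explicitly. A minimal invocation looks like the sketch below (the
// package path is an assumption; point it at wherever this file lives in the
// module):
//
//	go test -tags cuda -run TestHarness_CUDA ./...
//
// The -run filter matches the TestHarness_CUDA_* tests defined here; each one
// additionally skips itself when cuda.Available() reports no usable device.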