//go:build cuda

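// These tests exercise the CUDA backend against CPU references, so they only
// build with the `cuda` tag, e.g. `go test -tags cuda ./...` (narrow the
// package pattern to wherever this package lives in your tree).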

package tests

import (
	"math"
	"math/rand"
	"testing"
	"unsafe"

	"makarna/pkg/backend/cpu"
	cpunn "makarna/pkg/backend/cpu/nn"
	"makarna/pkg/backend/cuda"
	"makarna/pkg/quant"
	"makarna/pkg/tensor"
)

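// almostEq and assertAllClose use a combined absolute+relative tolerance,
// |got-want| <= atol + rtol*|want|, similar in spirit to numpy.isclose.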
func almostEq(a, b, atol, rtol float32) bool {
	d := a - b
	if d < 0 {
		d = -d
	}
	thr := atol + rtol*float32(math.Abs(float64(b)))
	return d <= thr
}

func assertAllClose(t *testing.T, name string, got, want []float32, atol, rtol float32) {
	if len(got) != len(want) {
		t.Fatalf("%s len mismatch got=%d want=%d", name, len(got), len(want))
	}
	for i := range got {
		if !almostEq(got[i], want[i], atol, rtol) {
			t.Fatalf("%s mismatch at %d: got=%f want=%f (atol=%g rtol=%g)", name, i, got[i], want[i], atol, rtol)
		}
	}
}

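// TestHarness_CUDA_DequantMatchesCPU quantizes one 256-element super-block per
// K-quant format, dequantizes it with the CUDA kernels, and checks the result
// against the CPU dequantization, at several input scales.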
func TestHarness_CUDA_DequantMatchesCPU(t *testing.T) {
	gpu := 0
	if !cuda.Available() {
		t.Skip("cuda not available")
	}
	cases := []struct {
		name  string
		seed  int64
		scale float32
	}{
		{name: "small", seed: 10, scale: 0.01},
		{name: "medium", seed: 11, scale: 1.0},
		{name: "large", seed: 12, scale: 50.0},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			r := rand.New(rand.NewSource(tc.seed))
			inp := make([]float32, 256)
			for i := range inp {
				inp[i] = (r.Float32()*2 - 1) * tc.scale
			}
			q2 := quant.QuantizeQ2K(inp)
			q3 := quant.QuantizeQ3K(inp)
			q4 := quant.QuantizeQ4K(inp)
			q6 := quant.QuantizeQ6K(inp)
			q8 := quant.QuantizeQ8K(inp)
			ref2 := make([]float32, 256)
			ref3 := make([]float32, 256)
			ref4 := make([]float32, 256)
			ref6 := make([]float32, 256)
			ref8 := make([]float32, 256)
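			// CPU reference: dequantize each single super-block back to float32.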
			tensor.DequantizeQ2_K((*tensor.BlockQ2_K)(unsafe.Pointer(&q2[0])), ref2)
			tensor.DequantizeQ3_K((*tensor.BlockQ3_K)(unsafe.Pointer(&q3[0])), ref3)
			tensor.DequantizeQ4_K((*tensor.BlockQ4_K)(unsafe.Pointer(&q4[0])), ref4)
			tensor.DequantizeQ6_K((*tensor.BlockQ6_K)(unsafe.Pointer(&q6[0])), ref6)
			tensor.DequantizeQ8_K((*tensor.BlockQ8_K)(unsafe.Pointer(&q8[0])), ref8)

			// Allocate output
			out, err := cuda.NewTensor(tensor.Shape{256}, tensor.Float32, gpu)
			if err != nil {
				t.Fatalf("new out: %v", err)
			}
			defer out.Free()

			// Q8
			devQ8, err := cuda.UploadQ8K(q8, 1, gpu)
			if err != nil {
				t.Fatalf("upload q8: %v", err)
			}
			defer cuda.FreeDevicePtr(devQ8)
			if err := cuda.DequantQ8K(devQ8, out.Data().(unsafe.Pointer), 1, gpu); err != nil {
				t.Fatalf("dequant q8: %v", err)
			}
			h8 := make([]float32, 256)
			if err := out.CopyToHost(h8); err != nil {
				t.Fatalf("copy q8: %v", err)
			}
			assertAllClose(t, "q8k", h8, ref8, 1e-3, 1e-3)

			// Q4
			devQ4, err := cuda.UploadQ4K(q4, 1, gpu)
			if err != nil {
				t.Fatalf("upload q4: %v", err)
			}
			defer cuda.FreeDevicePtr(devQ4)
			if err := cuda.DequantQ4K(devQ4, out.Data().(unsafe.Pointer), 1, gpu); err != nil {
				t.Fatalf("dequant q4: %v", err)
			}
			h4 := make([]float32, 256)
			if err := out.CopyToHost(h4); err != nil {
				t.Fatalf("copy q4: %v", err)
			}
			assertAllClose(t, "q4k", h4, ref4, 1e-2, 1e-2)

			// Q6
			devQ6, err := cuda.UploadQ6K(q6, 1, gpu)
			if err != nil {
				t.Fatalf("upload q6: %v", err)
			}
			defer cuda.FreeDevicePtr(devQ6)
			if err := cuda.DequantQ6K(devQ6, out.Data().(unsafe.Pointer), 1, gpu); err != nil {
				t.Fatalf("dequant q6: %v", err)
			}
			h6 := make([]float32, 256)
			if err := out.CopyToHost(h6); err != nil {
				t.Fatalf("copy q6: %v", err)
			}
			assertAllClose(t, "q6k", h6, ref6, 1e-2, 1e-2)

			// Q3
			devQ3, err := cuda.UploadQ3K(q3, 1, gpu)
			if err != nil {
				t.Fatalf("upload q3: %v", err)
			}
			defer cuda.FreeDevicePtr(devQ3)
			if err := cuda.DequantQ3K(devQ3, out.Data().(unsafe.Pointer), 1, gpu); err != nil {
				t.Fatalf("dequant q3: %v", err)
			}
			h3 := make([]float32, 256)
			if err := out.CopyToHost(h3); err != nil {
				t.Fatalf("copy q3: %v", err)
			}
			assertAllClose(t, "q3k", h3, ref3, 2e-2, 2e-2)

			// Q2
			devQ2, err := cuda.UploadQ2K(q2, 1, gpu)
			if err != nil {
				t.Fatalf("upload q2: %v", err)
			}
			defer cuda.FreeDevicePtr(devQ2)
			if err := cuda.DequantQ2K(devQ2, out.Data().(unsafe.Pointer), 1, gpu); err != nil {
				t.Fatalf("dequant q2: %v", err)
			}
			h2 := make([]float32, 256)
			if err := out.CopyToHost(h2); err != nil {
				t.Fatalf("copy q2: %v", err)
			}
			assertAllClose(t, "q2k", h2, ref2, 5e-2, 5e-2)
		})
	}
}

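// TestHarness_CUDA_FusedMatMulMatchesCPUReference checks the fused
// dequantize+matmul kernels (C = A @ B^T with a quantized B) against a plain
// float32 CPU matmul.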
func TestHarness_CUDA_FusedMatMulMatchesCPUReference(t *testing.T) {
	gpu := 0
	if !cuda.Available() {
		t.Skip("cuda not available")
	}
	// Keep M and N small; K must be a multiple of 256 for the K-quant formats.
	M, K, N := 3, 256, 4
	r := rand.New(rand.NewSource(999))
	// CPU inputs
	Ahost := make([]float32, M*K)
	Bhost := make([]float32, N*K)
	for i := range Ahost {
		Ahost[i] = r.Float32()*2 - 1
	}
	for i := range Bhost {
		Bhost[i] = r.Float32()*2 - 1
	}
	// CPU reference: C = A @ B^T
	ref := make([]float32, M*N)
	for m := 0; m < M; m++ {
		for n := 0; n < N; n++ {
			var s float32
			for k := 0; k < K; k++ {
				s += Ahost[m*K+k] * Bhost[n*K+k]
			}
			ref[m*N+n] = s
		}
	}
	// Upload A to GPU
	Adev, err := cuda.NewTensor(tensor.Shape{M, K}, tensor.Float32, gpu)
	if err != nil {
		t.Fatalf("Adev: %v", err)
	}
	defer Adev.Free()
	if err := Adev.CopyFrom(Ahost); err != nil {
		t.Fatalf("copy A: %v", err)
	}
	Cdev, err := cuda.NewTensor(tensor.Shape{M, N}, tensor.Float32, gpu)
	if err != nil {
		t.Fatalf("Cdev: %v", err)
	}
	defer Cdev.Free()
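	// Each subtest quantizes B into one K-quant format and compares the fused
	// GPU result against the float32 reference; tolerances widen as the bit
	// width drops.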
- t.Run("q8k", func(t *testing.T) {
- q := quant.QuantizeQ8K(Bhost)
- devB, err := cuda.UploadQ8K(q, N*(K/256), gpu)
- if err != nil {
- t.Fatalf("upload: %v", err)
- }
- defer cuda.FreeDevicePtr(devB)
- if err := cuda.MatMulQ8K(Adev.Data().(unsafe.Pointer), devB, Cdev.Data().(unsafe.Pointer), M, K, N, gpu); err != nil {
- t.Fatalf("matmul: %v", err)
- }
- h := make([]float32, M*N)
- _ = cuda.Synchronize(gpu)
- if err := Cdev.CopyToHost(h); err != nil {
- t.Fatalf("copy: %v", err)
- }
- assertAllClose(t, "matmul q8k", h, ref, 5e-1, 5e-2)
- })
- t.Run("q4k", func(t *testing.T) {
- q := quant.QuantizeQ4K(Bhost)
- devB, err := cuda.UploadQ4K(q, N*(K/256), gpu)
- if err != nil {
- t.Fatalf("upload: %v", err)
- }
- defer cuda.FreeDevicePtr(devB)
- if err := cuda.MatMulQ4K(Adev.Data().(unsafe.Pointer), devB, Cdev.Data().(unsafe.Pointer), M, K, N, gpu); err != nil {
- t.Fatalf("matmul: %v", err)
- }
- h := make([]float32, M*N)
- _ = cuda.Synchronize(gpu)
- if err := Cdev.CopyToHost(h); err != nil {
- t.Fatalf("copy: %v", err)
- }
- assertAllClose(t, "matmul q4k", h, ref, 2.0, 1e-1)
- })
	// Q2/Q3/Q6 follow the same pattern; the lower-bit formats get looser tolerances.
- t.Run("q2k", func(t *testing.T) {
- q := quant.QuantizeQ2K(Bhost)
- devB, err := cuda.UploadQ2K(q, N*(K/256), gpu)
- if err != nil {
- t.Fatalf("upload: %v", err)
- }
- defer cuda.FreeDevicePtr(devB)
- if err := cuda.MatMulQ2K(Adev.Data().(unsafe.Pointer), devB, Cdev.Data().(unsafe.Pointer), M, K, N, gpu); err != nil {
- t.Fatalf("matmul: %v", err)
- }
- h := make([]float32, M*N)
- _ = cuda.Synchronize(gpu)
- if err := Cdev.CopyToHost(h); err != nil {
- t.Fatalf("copy: %v", err)
- }
- assertAllClose(t, "matmul q2k", h, ref, 3.0, 2e-1)
- })
- t.Run("q3k", func(t *testing.T) {
- q := quant.QuantizeQ3K(Bhost)
- devB, err := cuda.UploadQ3K(q, N*(K/256), gpu)
- if err != nil {
- t.Fatalf("upload: %v", err)
- }
- defer cuda.FreeDevicePtr(devB)
- if err := cuda.MatMulQ3K(Adev.Data().(unsafe.Pointer), devB, Cdev.Data().(unsafe.Pointer), M, K, N, gpu); err != nil {
- t.Fatalf("matmul: %v", err)
- }
- h := make([]float32, M*N)
- _ = cuda.Synchronize(gpu)
- if err := Cdev.CopyToHost(h); err != nil {
- t.Fatalf("copy: %v", err)
- }
- assertAllClose(t, "matmul q3k", h, ref, 2.5, 2e-1)
- })
- t.Run("q6k", func(t *testing.T) {
- q := quant.QuantizeQ6K(Bhost)
- devB, err := cuda.UploadQ6K(q, N*(K/256), gpu)
- if err != nil {
- t.Fatalf("upload: %v", err)
- }
- defer cuda.FreeDevicePtr(devB)
- if err := cuda.MatMulQ6K(Adev.Data().(unsafe.Pointer), devB, Cdev.Data().(unsafe.Pointer), M, K, N, gpu); err != nil {
- t.Fatalf("matmul: %v", err)
- }
- h := make([]float32, M*N)
- _ = cuda.Synchronize(gpu)
- if err := Cdev.CopyToHost(h); err != nil {
- t.Fatalf("copy: %v", err)
- }
- assertAllClose(t, "matmul q6k", h, ref, 1.0, 1e-1)
- })
- }
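// TestHarness_CUDA_NNOpsMatchCPU compares the CUDA RMSNorm, RoPE, Softmax and
// causal-attention kernels against their CPU counterparts on a tiny input.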
func TestHarness_CUDA_NNOpsMatchCPU(t *testing.T) {
	gpu := 0
	if !cuda.Available() {
		t.Skip("cuda not available")
	}
	seqLen := 4
	headDim := 8
	numHeads := 2
	numKVHeads := 1
	totalDim := numHeads * headDim
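	// numKVHeads < numHeads exercises the grouped-query path: K and V rows are
	// only numKVHeads*headDim wide, while Q rows are numHeads*headDim wide.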
	r := rand.New(rand.NewSource(2025))
	Q := make([]float32, seqLen*totalDim)
	K := make([]float32, seqLen*(numKVHeads*headDim))
	V := make([]float32, seqLen*(numKVHeads*headDim))
	W := make([]float32, totalDim)
	for i := range Q {
		Q[i] = r.Float32()*2 - 1
	}
	for i := range K {
		K[i] = r.Float32()*2 - 1
	}
	for i := range V {
		V[i] = r.Float32()*2 - 1
	}
	for i := range W {
		W[i] = r.Float32()*2 - 1
	}
	// CPU reference
	qCPU := cpu.NewTensor(tensor.Shape{seqLen, totalDim}, append([]float32(nil), Q...))
	kCPU := cpu.NewTensor(tensor.Shape{seqLen, numKVHeads * headDim}, append([]float32(nil), K...))
	vCPU := cpu.NewTensor(tensor.Shape{seqLen, numKVHeads * headDim}, append([]float32(nil), V...))
	outCPU := cpu.NewTensor(tensor.Shape{seqLen, totalDim}, nil)
	if err := cpunn.CausalAttention(qCPU, kCPU, vCPU, outCPU, numHeads, numKVHeads, headDim); err != nil {
		t.Fatalf("cpu attention: %v", err)
	}
	// CUDA tensors
	qGPU, _ := cuda.NewTensor(tensor.Shape{seqLen, totalDim}, tensor.Float32, gpu)
	kGPU, _ := cuda.NewTensor(tensor.Shape{seqLen, numKVHeads * headDim}, tensor.Float32, gpu)
	vGPU, _ := cuda.NewTensor(tensor.Shape{seqLen, numKVHeads * headDim}, tensor.Float32, gpu)
	outGPU, _ := cuda.NewTensor(tensor.Shape{seqLen, totalDim}, tensor.Float32, gpu)
	wGPU, _ := cuda.NewTensor(tensor.Shape{totalDim}, tensor.Float32, gpu)
	qAttGPU, _ := cuda.NewTensor(tensor.Shape{seqLen, totalDim}, tensor.Float32, gpu)
	defer qGPU.Free()
	defer kGPU.Free()
	defer vGPU.Free()
	defer outGPU.Free()
	defer wGPU.Free()
	defer qAttGPU.Free()
	_ = qGPU.CopyFrom(Q)
	_ = kGPU.CopyFrom(K)
	_ = vGPU.CopyFrom(V)
	_ = wGPU.CopyFrom(W)
	_ = qAttGPU.CopyFrom(Q)
	// RMSNorm CPU vs CUDA
	// Apply RMSNorm on a copy of Q
	qCPU2 := cpu.NewTensor(tensor.Shape{seqLen, totalDim}, append([]float32(nil), Q...))
	wCPU2 := cpu.NewTensor(tensor.Shape{totalDim}, append([]float32(nil), W...))
	if err := cpunn.RMSNorm(qCPU2, wCPU2, 1e-5); err != nil {
		t.Fatalf("cpu rmsnorm: %v", err)
	}
	if err := cuda.RMSNorm(qGPU.Data().(unsafe.Pointer), wGPU.Data().(unsafe.Pointer), seqLen, totalDim, 1e-5, gpu); err != nil {
		t.Fatalf("cuda rmsnorm: %v", err)
	}
	qR := make([]float32, seqLen*totalDim)
	_ = qGPU.CopyToHost(qR)
	assertAllClose(t, "rmsnorm", qR, qCPU2.DataFloat32(), 5e-3, 5e-3)
	// RoPE CPU vs CUDA
	pos := make([]int32, seqLen)
	posCPU := make([]int, seqLen)
	for i := range pos {
		pos[i] = int32(i)
		posCPU[i] = i
	}
	posDev, err := cuda.AllocAndCopyInt32(pos, gpu)
	if err != nil {
		t.Fatalf("alloc pos: %v", err)
	}
	defer cuda.FreeDevicePtr(posDev)
	qCPU3 := cpu.NewTensor(tensor.Shape{seqLen, totalDim}, append([]float32(nil), Q...))
	if err := cpunn.RoPE(qCPU3, posCPU, headDim, 10000); err != nil {
		t.Fatalf("cpu rope: %v", err)
	}
	qGPU2, _ := cuda.NewTensor(tensor.Shape{seqLen, totalDim}, tensor.Float32, gpu)
	defer qGPU2.Free()
	_ = qGPU2.CopyFrom(Q)
	if err := cuda.RoPE(qGPU2.Data().(unsafe.Pointer), posDev, seqLen, numHeads, headDim, 10000, gpu); err != nil {
		t.Fatalf("cuda rope: %v", err)
	}
	qr := make([]float32, seqLen*totalDim)
	_ = qGPU2.CopyToHost(qr)
	assertAllClose(t, "rope", qr, qCPU3.DataFloat32(), 2e-2, 2e-2)
	// Softmax CPU vs CUDA on one row
	rowCPU := cpu.NewTensor(tensor.Shape{totalDim}, append([]float32(nil), Q[:totalDim]...))
	if err := cpunn.Softmax(rowCPU); err != nil {
		t.Fatalf("cpu softmax: %v", err)
	}
	rowGPU, _ := cuda.NewTensor(tensor.Shape{1, totalDim}, tensor.Float32, gpu)
	defer rowGPU.Free()
	_ = rowGPU.CopyFrom(Q[:totalDim])
	if err := cuda.Softmax(rowGPU.Data().(unsafe.Pointer), 1, totalDim, gpu); err != nil {
		t.Fatalf("cuda softmax: %v", err)
	}
	rowOut := make([]float32, totalDim)
	_ = rowGPU.CopyToHost(rowOut)
	assertAllClose(t, "softmax", rowOut, rowCPU.DataFloat32(), 2e-3, 2e-3)
	// Attention CPU vs CUDA
	scale := float32(1.0 / math.Sqrt(float64(headDim)))
	if err := cuda.Attention(qAttGPU.Data().(unsafe.Pointer), kGPU.Data().(unsafe.Pointer), vGPU.Data().(unsafe.Pointer), outGPU.Data().(unsafe.Pointer), seqLen, seqLen, numHeads, numKVHeads, headDim, scale, 0, gpu); err != nil {
		t.Fatalf("cuda attention: %v", err)
	}
	outH := make([]float32, seqLen*totalDim)
	_ = outGPU.CopyToHost(outH)
	assertAllClose(t, "attention", outH, outCPU.DataFloat32(), 5e-2, 5e-2)
}

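// TestHarness_CUDA_PagedAttentionBatchMatchesSingle checks that the batched
// paged-attention kernel matches running the single-sequence kernel once per
// sequence, for two decode tokens with different KV lengths.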
func TestHarness_CUDA_PagedAttentionBatchMatchesSingle(t *testing.T) {
	gpu := 0
	if !cuda.Available() {
		t.Skip("cuda not available")
	}
	blockSize := 4
	kvLen0 := 5
	kvLen1 := 6
	headDim := 8
	numHeads := 2
	numKVHeads := 1
	kvStride := numKVHeads * headDim
	scale := float32(1.0 / math.Sqrt(float64(headDim)))
	// Decode-style: one token per sequence.
	numTokens := 2
	qGPU, err := cuda.NewTensor(tensor.Shape{numTokens, numHeads * headDim}, tensor.Float32, gpu)
	if err != nil {
		t.Fatalf("new q: %v", err)
	}
	defer qGPU.Free()
	outBatchGPU, err := cuda.NewTensor(tensor.Shape{numTokens, numHeads * headDim}, tensor.Float32, gpu)
	if err != nil {
		t.Fatalf("new out batch: %v", err)
	}
	defer outBatchGPU.Free()
	// Per-sequence outputs.
	out0GPU, _ := cuda.NewTensor(tensor.Shape{1, numHeads * headDim}, tensor.Float32, gpu)
	out1GPU, _ := cuda.NewTensor(tensor.Shape{1, numHeads * headDim}, tensor.Float32, gpu)
	defer out0GPU.Free()
	defer out1GPU.Free()
	r := rand.New(rand.NewSource(777))
	qHost := make([]float32, numTokens*numHeads*headDim)
	for i := range qHost {
		qHost[i] = r.Float32()*2 - 1
	}
	if err := qGPU.CopyFrom(qHost); err != nil {
		t.Fatalf("copy q: %v", err)
	}
	// Build paged K/V blocks for each sequence.
	makeSeqBlocks := func(kvLen int) ([]*cuda.Tensor, []*cuda.Tensor, []uintptr, []uintptr) {
		nBlocks := (kvLen + blockSize - 1) / blockSize
		kBlocks := make([]*cuda.Tensor, nBlocks)
		vBlocks := make([]*cuda.Tensor, nBlocks)
		kPtrs := make([]uintptr, nBlocks)
		vPtrs := make([]uintptr, nBlocks)
		for b := 0; b < nBlocks; b++ {
			kT, err := cuda.NewTensor(tensor.Shape{blockSize, kvStride}, tensor.Float32, gpu)
			if err != nil {
				t.Fatalf("new k block: %v", err)
			}
			vT, err := cuda.NewTensor(tensor.Shape{blockSize, kvStride}, tensor.Float32, gpu)
			if err != nil {
				t.Fatalf("new v block: %v", err)
			}
			kBlocks[b] = kT
			vBlocks[b] = vT
			kPtrs[b] = uintptr(kT.Data().(unsafe.Pointer))
			vPtrs[b] = uintptr(vT.Data().(unsafe.Pointer))
			kHost := make([]float32, blockSize*kvStride)
			vHost := make([]float32, blockSize*kvStride)
			for i := range kHost {
				kHost[i] = r.Float32()*2 - 1
				vHost[i] = r.Float32()*2 - 1
			}
			_ = kT.CopyFrom(kHost)
			_ = vT.CopyFrom(vHost)
		}
		return kBlocks, vBlocks, kPtrs, vPtrs
	}
	kBlocks0, vBlocks0, kPtrs0, vPtrs0 := makeSeqBlocks(kvLen0)
	kBlocks1, vBlocks1, kPtrs1, vPtrs1 := makeSeqBlocks(kvLen1)
	defer func() {
		for i := range kBlocks0 {
			kBlocks0[i].Free()
			vBlocks0[i].Free()
		}
		for i := range kBlocks1 {
			kBlocks1[i].Free()
			vBlocks1[i].Free()
		}
	}()
	// Reference: run single-seq paged attention for each token.
	kDev0, err := cuda.AllocAndCopyPtrTable(kPtrs0, gpu)
	if err != nil {
		t.Fatalf("alloc k ptrs0: %v", err)
	}
	defer cuda.FreeDevicePtr(kDev0)
	vDev0, err := cuda.AllocAndCopyPtrTable(vPtrs0, gpu)
	if err != nil {
		t.Fatalf("alloc v ptrs0: %v", err)
	}
	defer cuda.FreeDevicePtr(vDev0)
	kDev1, err := cuda.AllocAndCopyPtrTable(kPtrs1, gpu)
	if err != nil {
		t.Fatalf("alloc k ptrs1: %v", err)
	}
	defer cuda.FreeDevicePtr(kDev1)
	vDev1, err := cuda.AllocAndCopyPtrTable(vPtrs1, gpu)
	if err != nil {
		t.Fatalf("alloc v ptrs1: %v", err)
	}
	defer cuda.FreeDevicePtr(vDev1)
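	// Per-token views into the batched query buffer; the second view is offset
	// by numHeads*headDim*4, which appears to be a byte offset (4 bytes per float32).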
	q0View, _ := qGPU.ViewAt(tensor.Shape{1, numHeads * headDim}, 0)
	q1View, _ := qGPU.ViewAt(tensor.Shape{1, numHeads * headDim}, uintptr(numHeads*headDim*4))
	if err := cuda.PagedAttention(
		q0View.Data().(unsafe.Pointer),
		kDev0, vDev0,
		out0GPU.Data().(unsafe.Pointer),
		1, kvLen0,
		numHeads, numKVHeads, headDim,
		blockSize,
		scale, kvLen0-1,
		gpu,
	); err != nil {
		t.Fatalf("paged attention 0: %v", err)
	}
	if err := cuda.PagedAttention(
		q1View.Data().(unsafe.Pointer),
		kDev1, vDev1,
		out1GPU.Data().(unsafe.Pointer),
		1, kvLen1,
		numHeads, numKVHeads, headDim,
		blockSize,
		scale, kvLen1-1,
		gpu,
	); err != nil {
		t.Fatalf("paged attention 1: %v", err)
	}
	// Batched: flatten block pointer tables.
	flatKPtrs := append(append([]uintptr(nil), kPtrs0...), kPtrs1...)
	flatVPtrs := append(append([]uintptr(nil), vPtrs0...), vPtrs1...)
	kFlatDev, err := cuda.AllocAndCopyPtrTable(flatKPtrs, gpu)
	if err != nil {
		t.Fatalf("alloc flat k: %v", err)
	}
	defer cuda.FreeDevicePtr(kFlatDev)
	vFlatDev, err := cuda.AllocAndCopyPtrTable(flatVPtrs, gpu)
	if err != nil {
		t.Fatalf("alloc flat v: %v", err)
	}
	defer cuda.FreeDevicePtr(vFlatDev)
	blockOffsets := []int32{0, int32(len(kPtrs0))}
	kvLens := []int32{int32(kvLen0), int32(kvLen1)}
	queryPos := []int32{int32(kvLen0 - 1), int32(kvLen1 - 1)}
	maxKvLen := kvLen0
	if kvLen1 > maxKvLen {
		maxKvLen = kvLen1
	}
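	// Upload the per-sequence metadata (block-table offsets, KV lengths, query
	// positions) that the batched kernel consumes.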
	offDev, err := cuda.AllocAndCopyInt32(blockOffsets, gpu)
	if err != nil {
		t.Fatalf("alloc offsets: %v", err)
	}
	defer cuda.FreeDevicePtr(offDev)
	kvDev, err := cuda.AllocAndCopyInt32(kvLens, gpu)
	if err != nil {
		t.Fatalf("alloc kv lens: %v", err)
	}
	defer cuda.FreeDevicePtr(kvDev)
	qposDev, err := cuda.AllocAndCopyInt32(queryPos, gpu)
	if err != nil {
		t.Fatalf("alloc qpos: %v", err)
	}
	defer cuda.FreeDevicePtr(qposDev)
	if err := cuda.PagedAttentionBatch(
		qGPU.Data().(unsafe.Pointer),
		kFlatDev,
		vFlatDev,
		offDev,
		kvDev,
		qposDev,
		outBatchGPU.Data().(unsafe.Pointer),
		numTokens,
		numHeads, numKVHeads, headDim,
		blockSize,
		scale,
		maxKvLen,
		gpu,
	); err != nil {
		t.Fatalf("paged attention batch: %v", err)
	}
	outBatchHost := make([]float32, numTokens*numHeads*headDim)
	if err := outBatchGPU.CopyToHost(outBatchHost); err != nil {
		t.Fatalf("copy out batch: %v", err)
	}
	out0Host := make([]float32, numHeads*headDim)
	out1Host := make([]float32, numHeads*headDim)
	_ = out0GPU.CopyToHost(out0Host)
	_ = out1GPU.CopyToHost(out1Host)
	assertAllClose(t, "paged_attention_batch_tok0", outBatchHost[:numHeads*headDim], out0Host, 2e-2, 2e-2)
	assertAllClose(t, "paged_attention_batch_tok1", outBatchHost[numHeads*headDim:], out1Host, 2e-2, 2e-2)
}