//go:build cuda

package cuda

import (
	"math"
	"testing"
	"unsafe"

	"makarna/pkg/backend/cpu/nn"
	"makarna/pkg/tensor"
)

// TestL2NormHeads compares the CUDA L2NormHeads kernel against the CPU
// reference implementation in nn.L2NormHeads.
func TestL2NormHeads(t *testing.T) {
	if !Available() {
		t.Skip("CUDA not available")
	}
	tokens, numHeads, headDim := 4, 8, 64
	n := tokens * numHeads * headDim
	eps := float32(1e-6)

	// CPU reference
	qCPU := make([]float32, n)
	kCPU := make([]float32, n)
	for i := range qCPU {
		qCPU[i] = float32(i%100) / 50.0
		kCPU[i] = float32((i+37)%100) / 50.0
	}
	qRef := make([]float32, n)
	kRef := make([]float32, n)
	copy(qRef, qCPU)
	copy(kRef, kCPU)
	nn.L2NormHeads(qRef, kRef, tokens, numHeads, headDim, eps)

	// GPU
	qDev, _ := NewTensor(tensor.Shape{tokens, numHeads * headDim}, tensor.Float32, 0)
	defer qDev.Free()
	kDev, _ := NewTensor(tensor.Shape{tokens, numHeads * headDim}, tensor.Float32, 0)
	defer kDev.Free()
	qDev.CopyFrom(qCPU)
	kDev.CopyFrom(kCPU)

	if err := L2NormHeads(qDev.Data().(unsafe.Pointer), kDev.Data().(unsafe.Pointer), tokens, numHeads, headDim, eps, 0); err != nil {
		t.Fatal(err)
	}

	qOut := make([]float32, n)
	kOut := make([]float32, n)
	qDev.CopyToHost(qOut)
	kDev.CopyToHost(kOut)

	for i := 0; i < n; i++ {
		if math.Abs(float64(qOut[i]-qRef[i])) > 1e-4 {
			t.Errorf("Q mismatch at %d: got %f, want %f", i, qOut[i], qRef[i])
			break
		}
		if math.Abs(float64(kOut[i]-kRef[i])) > 1e-4 {
			t.Errorf("K mismatch at %d: got %f, want %f", i, kOut[i], kRef[i])
			break
		}
	}
}

// TestSigmoid compares the CUDA Sigmoid kernel against nn.SigmoidInplace.
func TestSigmoid(t *testing.T) {
	if !Available() {
		t.Skip("CUDA not available")
	}
	n := 1024
	input := make([]float32, n)
	for i := range input {
		input[i] = float32(i-512) / 100.0
	}

	// CPU reference
	ref := make([]float32, n)
	copy(ref, input)
	nn.SigmoidInplace(ref)

	// GPU
	dev, _ := NewTensor(tensor.Shape{n}, tensor.Float32, 0)
	defer dev.Free()
	dev.CopyFrom(input)
	if err := Sigmoid(dev.Data().(unsafe.Pointer), n, 0); err != nil {
		t.Fatal(err)
	}
	out := make([]float32, n)
	dev.CopyToHost(out)

	for i := 0; i < n; i++ {
		if math.Abs(float64(out[i]-ref[i])) > 1e-5 {
			t.Errorf("Sigmoid mismatch at %d: got %f, want %f", i, out[i], ref[i])
			break
		}
	}
}

// TestSoftmaxRows compares the CUDA SoftmaxRows kernel against a manual
// per-row softmax computed on the CPU.
func TestSoftmaxRows(t *testing.T) {
	if !Available() {
		t.Skip("CUDA not available")
	}
	rows, cols := 16, 64
	n := rows * cols
	input := make([]float32, n)
	for i := range input {
		input[i] = float32(i%100) / 50.0
	}

	// CPU reference (manual softmax per row, max-subtracted for stability)
	ref := make([]float32, n)
	copy(ref, input)
	for r := 0; r < rows; r++ {
		row := ref[r*cols : (r+1)*cols]
		maxVal := row[0]
		for _, v := range row {
			if v > maxVal {
				maxVal = v
			}
		}
		sum := float32(0)
		for i := range row {
			row[i] = float32(math.Exp(float64(row[i] - maxVal)))
			sum += row[i]
		}
		for i := range row {
			row[i] /= sum
		}
	}

	// GPU
	dev, _ := NewTensor(tensor.Shape{rows, cols}, tensor.Float32, 0)
	defer dev.Free()
	dev.CopyFrom(input)
	if err := SoftmaxRows(dev.Data().(unsafe.Pointer), rows, cols, 0); err != nil {
		t.Fatal(err)
	}
	out := make([]float32, n)
	dev.CopyToHost(out)

	for i := 0; i < n; i++ {
		if math.Abs(float64(out[i]-ref[i])) > 1e-5 {
			t.Errorf("Softmax mismatch at %d: got %f, want %f", i, out[i], ref[i])
			break
		}
	}
}

// TestTopKPerRow checks that the CUDA TopKPerRow kernel returns the expected
// indices and values for rows with known top-k entries.
func TestTopKPerRow(t *testing.T) {
	if !Available() {
		t.Skip("CUDA not available")
	}
	rows, cols, k := 4, 16, 3
	scores := make([]float32, rows*cols)
	for i := range scores {
		scores[i] = float32(i % cols)
	}
	// Plant known top-k values in rows 0 and 1
	scores[0*cols+5] = 100
	scores[0*cols+10] = 90
	scores[0*cols+2] = 80
	scores[1*cols+15] = 50
	scores[1*cols+0] = 40
	scores[1*cols+7] = 30

	// GPU
	scoresDev, _ := NewTensor(tensor.Shape{rows, cols}, tensor.Float32, 0)
	defer scoresDev.Free()
	scoresDev.CopyFrom(scores)
	indicesDev, _ := NewTensor(tensor.Shape{rows, k}, tensor.Int32, 0)
	defer indicesDev.Free()
	valuesDev, _ := NewTensor(tensor.Shape{rows, k}, tensor.Float32, 0)
	defer valuesDev.Free()

	if err := TopKPerRow(scoresDev.Data().(unsafe.Pointer), indicesDev.Data().(unsafe.Pointer), valuesDev.Data().(unsafe.Pointer), rows, cols, k, 0); err != nil {
		t.Fatal(err)
	}

	indices := make([]int32, rows*k)
	values := make([]float32, rows*k)
	indicesDev.CopyToInt32(indices)
	valuesDev.CopyToHost(values)

	// Check first row: should be indices 5, 10, 2 with values 100, 90, 80
	if indices[0] != 5 || indices[1] != 10 || indices[2] != 2 {
		t.Errorf("Row 0 indices: got %v, want [5, 10, 2]", indices[0:3])
	}
	if values[0] != 100 || values[1] != 90 || values[2] != 80 {
		t.Errorf("Row 0 values: got %v, want [100, 90, 80]", values[0:3])
	}
	// Check second row: should be indices 15, 0, 7 with values 50, 40, 30
	if indices[3] != 15 || indices[4] != 0 || indices[5] != 7 {
		t.Errorf("Row 1 indices: got %v, want [15, 0, 7]", indices[3:6])
	}
}

// TestRMSNormGated compares the CUDA RMSNormGated kernel against the CPU
// reference implementation in nn.RMSNormGated.
func TestRMSNormGated(t *testing.T) {
	if !Available() {
		t.Skip("CUDA not available")
	}
	numHeads, headDim := 8, 64
	n := numHeads * headDim
	eps := float32(1e-5)

	out := make([]float32, n)
	g := make([]float32, n)
	weight := make([]float32, headDim)
	for i := range out {
		out[i] = float32(i%100) / 50.0
		g[i] = float32((i+13)%100) / 100.0
	}
	for i := range weight {
		weight[i] = 1.0 + float32(i)/float32(headDim)
	}

	// CPU reference
	ref := make([]float32, n)
	copy(ref, out)
	nn.RMSNormGated(ref, g, weight, headDim, eps)

	// GPU
	outDev, _ := NewTensor(tensor.Shape{n}, tensor.Float32, 0)
	defer outDev.Free()
	outDev.CopyFrom(out)
	gDev, _ := NewTensor(tensor.Shape{n}, tensor.Float32, 0)
	defer gDev.Free()
	gDev.CopyFrom(g)
	weightDev, _ := NewTensor(tensor.Shape{headDim}, tensor.Float32, 0)
	defer weightDev.Free()
	weightDev.CopyFrom(weight)

	if err := RMSNormGated(outDev.Data().(unsafe.Pointer), gDev.Data().(unsafe.Pointer), weightDev.Data().(unsafe.Pointer), n, headDim, eps, 0); err != nil {
		t.Fatal(err)
	}

	result := make([]float32, n)
	outDev.CopyToHost(result)

	for i := 0; i < n; i++ {
		if math.Abs(float64(result[i]-ref[i])) > 1e-4 {
			t.Errorf("RMSNormGated mismatch at %d: got %f, want %f", i, result[i], ref[i])
			break
		}
	}
}