| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254 |
- //go:build cuda
- package cuda
- import (
- "math"
- "testing"
- "unsafe"
- "makarna/pkg/backend/cpu/nn"
- "makarna/pkg/tensor"
- )
- func TestL2NormHeads(t *testing.T) {
- if !Available() {
- t.Skip("CUDA not available")
- }
- tokens, numHeads, headDim := 4, 8, 64
- n := tokens * numHeads * headDim
- eps := float32(1e-6)
- // CPU reference
- qCPU := make([]float32, n)
- kCPU := make([]float32, n)
- for i := range qCPU {
- qCPU[i] = float32(i%100) / 50.0
- kCPU[i] = float32((i+37)%100) / 50.0
- }
- qRef := make([]float32, n)
- kRef := make([]float32, n)
- copy(qRef, qCPU)
- copy(kRef, kCPU)
- nn.L2NormHeads(qRef, kRef, tokens, numHeads, headDim, eps)
- // GPU
- qDev, _ := NewTensor(tensor.Shape{tokens, numHeads * headDim}, tensor.Float32, 0)
- defer qDev.Free()
- kDev, _ := NewTensor(tensor.Shape{tokens, numHeads * headDim}, tensor.Float32, 0)
- defer kDev.Free()
- qDev.CopyFrom(qCPU)
- kDev.CopyFrom(kCPU)
- if err := L2NormHeads(qDev.Data().(unsafe.Pointer), kDev.Data().(unsafe.Pointer), tokens, numHeads, headDim, eps, 0); err != nil {
- t.Fatal(err)
- }
- qOut := make([]float32, n)
- kOut := make([]float32, n)
- qDev.CopyToHost(qOut)
- kDev.CopyToHost(kOut)
- for i := 0; i < n; i++ {
- if math.Abs(float64(qOut[i]-qRef[i])) > 1e-4 {
- t.Errorf("Q mismatch at %d: got %f, want %f", i, qOut[i], qRef[i])
- break
- }
- if math.Abs(float64(kOut[i]-kRef[i])) > 1e-4 {
- t.Errorf("K mismatch at %d: got %f, want %f", i, kOut[i], kRef[i])
- break
- }
- }
- }
- func TestSigmoid(t *testing.T) {
- if !Available() {
- t.Skip("CUDA not available")
- }
- n := 1024
- input := make([]float32, n)
- for i := range input {
- input[i] = float32(i-512) / 100.0
- }
- // CPU reference
- ref := make([]float32, n)
- copy(ref, input)
- nn.SigmoidInplace(ref)
- // GPU
- dev, _ := NewTensor(tensor.Shape{n}, tensor.Float32, 0)
- defer dev.Free()
- dev.CopyFrom(input)
- if err := Sigmoid(dev.Data().(unsafe.Pointer), n, 0); err != nil {
- t.Fatal(err)
- }
- out := make([]float32, n)
- dev.CopyToHost(out)
- for i := 0; i < n; i++ {
- if math.Abs(float64(out[i]-ref[i])) > 1e-5 {
- t.Errorf("Sigmoid mismatch at %d: got %f, want %f", i, out[i], ref[i])
- break
- }
- }
- }
- func TestSoftmaxRows(t *testing.T) {
- if !Available() {
- t.Skip("CUDA not available")
- }
- rows, cols := 16, 64
- n := rows * cols
- input := make([]float32, n)
- for i := range input {
- input[i] = float32(i%100) / 50.0
- }
- // CPU reference (manual softmax per row)
- ref := make([]float32, n)
- copy(ref, input)
- for r := 0; r < rows; r++ {
- row := ref[r*cols : (r+1)*cols]
- maxVal := row[0]
- for _, v := range row {
- if v > maxVal {
- maxVal = v
- }
- }
- sum := float32(0)
- for i := range row {
- row[i] = float32(math.Exp(float64(row[i] - maxVal)))
- sum += row[i]
- }
- for i := range row {
- row[i] /= sum
- }
- }
- // GPU
- dev, _ := NewTensor(tensor.Shape{rows, cols}, tensor.Float32, 0)
- defer dev.Free()
- dev.CopyFrom(input)
- if err := SoftmaxRows(dev.Data().(unsafe.Pointer), rows, cols, 0); err != nil {
- t.Fatal(err)
- }
- out := make([]float32, n)
- dev.CopyToHost(out)
- for i := 0; i < n; i++ {
- if math.Abs(float64(out[i]-ref[i])) > 1e-5 {
- t.Errorf("Softmax mismatch at %d: got %f, want %f", i, out[i], ref[i])
- break
- }
- }
- }
- func TestTopKPerRow(t *testing.T) {
- if !Available() {
- t.Skip("CUDA not available")
- }
- rows, cols, k := 4, 16, 3
- scores := make([]float32, rows*cols)
- for i := range scores {
- scores[i] = float32(i % cols)
- }
- // Set some specific values
- scores[0*cols+5] = 100
- scores[0*cols+10] = 90
- scores[0*cols+2] = 80
- scores[1*cols+15] = 50
- scores[1*cols+0] = 40
- scores[1*cols+7] = 30
- // GPU
- scoresDev, _ := NewTensor(tensor.Shape{rows, cols}, tensor.Float32, 0)
- defer scoresDev.Free()
- scoresDev.CopyFrom(scores)
- indicesDev, _ := NewTensor(tensor.Shape{rows, k}, tensor.Int32, 0)
- defer indicesDev.Free()
- valuesDev, _ := NewTensor(tensor.Shape{rows, k}, tensor.Float32, 0)
- defer valuesDev.Free()
- if err := TopKPerRow(scoresDev.Data().(unsafe.Pointer), indicesDev.Data().(unsafe.Pointer), valuesDev.Data().(unsafe.Pointer), rows, cols, k, 0); err != nil {
- t.Fatal(err)
- }
- indices := make([]int32, rows*k)
- values := make([]float32, rows*k)
- indicesDev.CopyToInt32(indices)
- valuesDev.CopyToHost(values)
- // Check first row: should be indices 5, 10, 2 with values 100, 90, 80
- if indices[0] != 5 || indices[1] != 10 || indices[2] != 2 {
- t.Errorf("Row 0 indices: got %v, want [5, 10, 2]", indices[0:3])
- }
- if values[0] != 100 || values[1] != 90 || values[2] != 80 {
- t.Errorf("Row 0 values: got %v, want [100, 90, 80]", values[0:3])
- }
- // Check second row: should be indices 15, 0, 7 with values 50, 40, 30
- if indices[3] != 15 || indices[4] != 0 || indices[5] != 7 {
- t.Errorf("Row 1 indices: got %v, want [15, 0, 7]", indices[3:6])
- }
- }
- func TestRMSNormGated(t *testing.T) {
- if !Available() {
- t.Skip("CUDA not available")
- }
- numHeads, headDim := 8, 64
- n := numHeads * headDim
- eps := float32(1e-5)
- out := make([]float32, n)
- g := make([]float32, n)
- weight := make([]float32, headDim)
- for i := range out {
- out[i] = float32(i%100) / 50.0
- g[i] = float32((i+13)%100) / 100.0
- }
- for i := range weight {
- weight[i] = 1.0 + float32(i)/float32(headDim)
- }
- // CPU reference
- ref := make([]float32, n)
- copy(ref, out)
- nn.RMSNormGated(ref, g, weight, headDim, eps)
- // GPU
- outDev, _ := NewTensor(tensor.Shape{n}, tensor.Float32, 0)
- defer outDev.Free()
- outDev.CopyFrom(out)
- gDev, _ := NewTensor(tensor.Shape{n}, tensor.Float32, 0)
- defer gDev.Free()
- gDev.CopyFrom(g)
- weightDev, _ := NewTensor(tensor.Shape{headDim}, tensor.Float32, 0)
- defer weightDev.Free()
- weightDev.CopyFrom(weight)
- if err := RMSNormGated(outDev.Data().(unsafe.Pointer), gDev.Data().(unsafe.Pointer), weightDev.Data().(unsafe.Pointer), n, headDim, eps, 0); err != nil {
- t.Fatal(err)
- }
- result := make([]float32, n)
- outDev.CopyToHost(result)
- for i := 0; i < n; i++ {
- if math.Abs(float64(result[i]-ref[i])) > 1e-4 {
- t.Errorf("RMSNormGated mismatch at %d: got %f, want %f", i, result[i], ref[i])
- break
- }
- }
- }
|