cuda_kernels_test.go

//go:build cuda

package cuda

import (
	"math"
	"testing"
	"unsafe"

	"makarna/pkg/backend/cpu/nn"
	"makarna/pkg/tensor"
)
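
// TestL2NormHeads checks the CUDA L2NormHeads kernel against the CPU
// reference nn.L2NormHeads: both normalize Q and K per head, and the
// results must agree element-wise within 1e-4.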
func TestL2NormHeads(t *testing.T) {
	if !Available() {
		t.Skip("CUDA not available")
	}
	tokens, numHeads, headDim := 4, 8, 64
	n := tokens * numHeads * headDim
	eps := float32(1e-6)
	// CPU reference
	qCPU := make([]float32, n)
	kCPU := make([]float32, n)
	for i := range qCPU {
		qCPU[i] = float32(i%100) / 50.0
		kCPU[i] = float32((i+37)%100) / 50.0
	}
	qRef := make([]float32, n)
	kRef := make([]float32, n)
	copy(qRef, qCPU)
	copy(kRef, kCPU)
	nn.L2NormHeads(qRef, kRef, tokens, numHeads, headDim, eps)
	// GPU
	qDev, _ := NewTensor(tensor.Shape{tokens, numHeads * headDim}, tensor.Float32, 0)
	defer qDev.Free()
	kDev, _ := NewTensor(tensor.Shape{tokens, numHeads * headDim}, tensor.Float32, 0)
	defer kDev.Free()
	qDev.CopyFrom(qCPU)
	kDev.CopyFrom(kCPU)
	if err := L2NormHeads(qDev.Data().(unsafe.Pointer), kDev.Data().(unsafe.Pointer), tokens, numHeads, headDim, eps, 0); err != nil {
		t.Fatal(err)
	}
	qOut := make([]float32, n)
	kOut := make([]float32, n)
	qDev.CopyToHost(qOut)
	kDev.CopyToHost(kOut)
	for i := 0; i < n; i++ {
		if math.Abs(float64(qOut[i]-qRef[i])) > 1e-4 {
			t.Errorf("Q mismatch at %d: got %f, want %f", i, qOut[i], qRef[i])
			break
		}
		if math.Abs(float64(kOut[i]-kRef[i])) > 1e-4 {
			t.Errorf("K mismatch at %d: got %f, want %f", i, kOut[i], kRef[i])
			break
		}
	}
}
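
// TestSigmoid checks the CUDA Sigmoid kernel against the CPU reference
// nn.SigmoidInplace on 1024 inputs spanning roughly [-5.12, 5.11],
// requiring element-wise agreement within 1e-5.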
func TestSigmoid(t *testing.T) {
	if !Available() {
		t.Skip("CUDA not available")
	}
	n := 1024
	input := make([]float32, n)
	for i := range input {
		input[i] = float32(i-512) / 100.0
	}
	// CPU reference
	ref := make([]float32, n)
	copy(ref, input)
	nn.SigmoidInplace(ref)
	// GPU
	dev, _ := NewTensor(tensor.Shape{n}, tensor.Float32, 0)
	defer dev.Free()
	dev.CopyFrom(input)
	if err := Sigmoid(dev.Data().(unsafe.Pointer), n, 0); err != nil {
		t.Fatal(err)
	}
	out := make([]float32, n)
	dev.CopyToHost(out)
	for i := 0; i < n; i++ {
		if math.Abs(float64(out[i]-ref[i])) > 1e-5 {
			t.Errorf("Sigmoid mismatch at %d: got %f, want %f", i, out[i], ref[i])
			break
		}
	}
}
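
// TestSoftmaxRows checks the CUDA SoftmaxRows kernel against a manual
// per-row softmax (max-subtracted for numerical stability) computed on
// the CPU, requiring element-wise agreement within 1e-5.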
func TestSoftmaxRows(t *testing.T) {
	if !Available() {
		t.Skip("CUDA not available")
	}
	rows, cols := 16, 64
	n := rows * cols
	input := make([]float32, n)
	for i := range input {
		input[i] = float32(i%100) / 50.0
	}
	// CPU reference (manual softmax per row)
	ref := make([]float32, n)
	copy(ref, input)
	for r := 0; r < rows; r++ {
		row := ref[r*cols : (r+1)*cols]
		maxVal := row[0]
		for _, v := range row {
			if v > maxVal {
				maxVal = v
			}
		}
		sum := float32(0)
		for i := range row {
			row[i] = float32(math.Exp(float64(row[i] - maxVal)))
			sum += row[i]
		}
		for i := range row {
			row[i] /= sum
		}
	}
	// GPU
	dev, _ := NewTensor(tensor.Shape{rows, cols}, tensor.Float32, 0)
	defer dev.Free()
	dev.CopyFrom(input)
	if err := SoftmaxRows(dev.Data().(unsafe.Pointer), rows, cols, 0); err != nil {
		t.Fatal(err)
	}
	out := make([]float32, n)
	dev.CopyToHost(out)
	for i := 0; i < n; i++ {
		if math.Abs(float64(out[i]-ref[i])) > 1e-5 {
			t.Errorf("Softmax mismatch at %d: got %f, want %f", i, out[i], ref[i])
			break
		}
	}
}
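
// TestTopKPerRow checks the CUDA TopKPerRow kernel on a small score
// matrix with known maxima: the returned indices and values must list
// each row's top-k entries in descending order.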
func TestTopKPerRow(t *testing.T) {
	if !Available() {
		t.Skip("CUDA not available")
	}
	rows, cols, k := 4, 16, 3
	scores := make([]float32, rows*cols)
	for i := range scores {
		scores[i] = float32(i % cols)
	}
	// Set some specific values
	scores[0*cols+5] = 100
	scores[0*cols+10] = 90
	scores[0*cols+2] = 80
	scores[1*cols+15] = 50
	scores[1*cols+0] = 40
	scores[1*cols+7] = 30
	// GPU
	scoresDev, _ := NewTensor(tensor.Shape{rows, cols}, tensor.Float32, 0)
	defer scoresDev.Free()
	scoresDev.CopyFrom(scores)
	indicesDev, _ := NewTensor(tensor.Shape{rows, k}, tensor.Int32, 0)
	defer indicesDev.Free()
	valuesDev, _ := NewTensor(tensor.Shape{rows, k}, tensor.Float32, 0)
	defer valuesDev.Free()
	if err := TopKPerRow(scoresDev.Data().(unsafe.Pointer), indicesDev.Data().(unsafe.Pointer), valuesDev.Data().(unsafe.Pointer), rows, cols, k, 0); err != nil {
		t.Fatal(err)
	}
	indices := make([]int32, rows*k)
	values := make([]float32, rows*k)
	indicesDev.CopyToInt32(indices)
	valuesDev.CopyToHost(values)
	// Check first row: should be indices 5, 10, 2 with values 100, 90, 80
	if indices[0] != 5 || indices[1] != 10 || indices[2] != 2 {
		t.Errorf("Row 0 indices: got %v, want [5, 10, 2]", indices[0:3])
	}
	if values[0] != 100 || values[1] != 90 || values[2] != 80 {
		t.Errorf("Row 0 values: got %v, want [100, 90, 80]", values[0:3])
	}
	// Check second row: should be indices 15, 0, 7 with values 50, 40, 30
	if indices[3] != 15 || indices[4] != 0 || indices[5] != 7 {
		t.Errorf("Row 1 indices: got %v, want [15, 0, 7]", indices[3:6])
	}
}
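
// TestRMSNormGated checks the CUDA RMSNormGated kernel against the CPU
// reference nn.RMSNormGated, which applies RMS normalization per head
// with a weight vector and a gating tensor, requiring element-wise
// agreement within 1e-4.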
func TestRMSNormGated(t *testing.T) {
	if !Available() {
		t.Skip("CUDA not available")
	}
	numHeads, headDim := 8, 64
	n := numHeads * headDim
	eps := float32(1e-5)
	out := make([]float32, n)
	g := make([]float32, n)
	weight := make([]float32, headDim)
	for i := range out {
		out[i] = float32(i%100) / 50.0
		g[i] = float32((i+13)%100) / 100.0
	}
	for i := range weight {
		weight[i] = 1.0 + float32(i)/float32(headDim)
	}
	// CPU reference
	ref := make([]float32, n)
	copy(ref, out)
	nn.RMSNormGated(ref, g, weight, headDim, eps)
	// GPU
	outDev, _ := NewTensor(tensor.Shape{n}, tensor.Float32, 0)
	defer outDev.Free()
	outDev.CopyFrom(out)
	gDev, _ := NewTensor(tensor.Shape{n}, tensor.Float32, 0)
	defer gDev.Free()
	gDev.CopyFrom(g)
	weightDev, _ := NewTensor(tensor.Shape{headDim}, tensor.Float32, 0)
	defer weightDev.Free()
	weightDev.CopyFrom(weight)
	if err := RMSNormGated(outDev.Data().(unsafe.Pointer), gDev.Data().(unsafe.Pointer), weightDev.Data().(unsafe.Pointer), n, headDim, eps, 0); err != nil {
		t.Fatal(err)
	}
	result := make([]float32, n)
	outDev.CopyToHost(result)
	for i := 0; i < n; i++ {
		if math.Abs(float64(result[i]-ref[i])) > 1e-4 {
			t.Errorf("RMSNormGated mismatch at %d: got %f, want %f", i, result[i], ref[i])
			break
		}
	}
}