moe.go

package compute

import (
	"fmt"
	"unsafe"

	"makarna/pkg/backend/cpu"
	"makarna/pkg/backend/cpu/nn"
	"makarna/pkg/backend/cuda"
	"makarna/pkg/backend/device"
	"makarna/pkg/tensor"
)
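// MoEConfig holds the routing hyperparameters for a mixture-of-experts
// (MoE) layer: how many experts exist, how many are activated per token
// (TopK), how the router scores are normalized, and whether grouped top-k
// selection, renormalization, scaling, or shared experts are used.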
type MoEConfig struct {
	NumExperts           int
	TopK                 int
	IntermediateSize     int
	RouterActivationFunc string // "softmax" or "sigmoid"
	UseGroupedTopK       bool
	NumExpertGroup       int
	TopKGroup            int
	Renormalize          bool
	RoutedScalingFactor  float32
	NumSharedExperts     int
}
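// MoEWeights bundles the router (gate) weights and the per-expert
// projection matrices. W1/W3 are the gate/up projections feeding the
// SwiGLU nonlinearity and W2 is the down projection back to the hidden
// size. SharedGate/SharedUp/SharedDown are the always-active shared
// experts used when NumSharedExperts > 0.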
type MoEWeights struct {
	GateW    tensor.Tensor
	GateBias tensor.Tensor
	W1       []tensor.Tensor // gate projections per expert
	W2       []tensor.Tensor // down projections per expert
	W3       []tensor.Tensor // up projections per expert

	SharedGate, SharedUp, SharedDown tensor.Tensor
}
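// HybridMoE runs a mixture-of-experts layer, dispatching to a GPU
// implementation when CUDA is available and the activations already live
// on the GPU, and falling back to the CPU implementation otherwise (or
// when ctx.CPUMoE pins the expert weights to the CPU).
//
// A minimal usage sketch (hypothetical values; the Context and weight
// loading are assumed to be set up elsewhere):
//
//	hidden, _ := NewActivation(tensor.Shape{seqLen, hiddenSize},
//		tensor.DevicePlacement{Type: tensor.CPU, GPU: -1})
//	out, _ := NewActivation(tensor.Shape{seqLen, hiddenSize},
//		tensor.DevicePlacement{Type: tensor.CPU, GPU: -1})
//	cfg := MoEConfig{
//		NumExperts:           64,
//		TopK:                 8,
//		IntermediateSize:     2048,
//		RouterActivationFunc: "softmax",
//		Renormalize:          true,
//		RoutedScalingFactor:  1.0,
//	}
//	if err := HybridMoE(ctx, hidden, weights, cfg, out); err != nil {
//		// handle error
//	}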
func HybridMoE(
	ctx *Context,
	hidden *Activation,
	weights *MoEWeights,
	cfg MoEConfig,
	out *Activation,
) error {
	// If CPUMoE is enabled, keep expert weights on CPU (saves GPU memory
	// for large MoE models).
	if ctx != nil && ctx.CPUMoE {
		return hybridMoECPU(ctx, hidden, weights, cfg, out)
	}
	if ctx != nil && ctx.IsGPU() && hidden.IsGPU() && device.CUDAAvailable() && cuda.Available() {
		err := hybridMoEGPU(ctx, hidden, weights, cfg, out)
		if err == nil {
			return nil
		}
		// The GPU path failed (most likely out of memory); fall back to
		// the CPU path below.
	}
	return hybridMoECPU(ctx, hidden, weights, cfg, out)
}
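// hybridMoEGPU computes the MoE layer with the matmuls and SwiGLU on the
// GPU. The routing logic (bias add, top-k selection, renormalization)
// still runs on the CPU: the gate scores are downloaded once per forward
// pass, and each token's hidden state is staged through a small reusable
// CPU tensor before the per-expert GPU matmuls.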
func hybridMoEGPU(
	ctx *Context,
	hidden *Activation,
	weights *MoEWeights,
	cfg MoEConfig,
	out *Activation,
) error {
	gpu := ctx.Placement().GPU
	seqLen := hidden.Shape()[0]
	hiddenSize := hidden.Shape()[1]

	// 1. Router: hidden -> gate scores.
	gateAct, err := NewActivation(tensor.Shape{seqLen, cfg.NumExperts}, tensor.DevicePlacement{Type: tensor.CUDA, GPU: gpu})
	if err != nil {
		return err
	}
	defer FreeActivation(gateAct)
	if err := HybridLinear(ctx, hidden, weights.GateW, gateAct); err != nil {
		return err
	}
	gateCUDA, err := gateAct.AsCUDA(gpu)
	if err != nil {
		return err
	}
	gatePtr := gateCUDA.Data().(unsafe.Pointer)

	// 2. Apply the router activation (softmax or sigmoid) in place on the GPU.
	if cfg.RouterActivationFunc == "softmax" {
		if err := cuda.SoftmaxRows(gatePtr, seqLen, cfg.NumExperts, gpu); err != nil {
			return err
		}
	} else {
		if err := cuda.Sigmoid(gatePtr, seqLen*cfg.NumExperts, gpu); err != nil {
			return err
		}
	}
	// 3. Bias add and top-k selection.
	// For now, download the scores to the CPU for the routing logic and
	// upload the results afterwards.
	gateCPU, err := gateAct.AsCPU()
	if err != nil {
		return err
	}
	gateData := gateCPU.DataFloat32()
	biasCPU, ok := weights.GateBias.(*cpu.Tensor)
	if !ok {
		return fmt.Errorf("moe gate bias not cpu tensor")
	}
	biasData := biasCPU.DataFloat32()

	// Output accumulator on the CPU; uploaded to the GPU at the end.
	outCUDA, err := out.AsCUDA(gpu)
	if err != nil {
		return err
	}
	outCPU := make([]float32, seqLen*hiddenSize)
	hiddenCPU, err := hidden.AsCPU()
	if err != nil {
		return err
	}
	inData := hiddenCPU.DataFloat32()
	// Pre-allocate reusable GPU buffers once per call; allocating fresh
	// buffers inside the per-token loop would leak GPU memory.
	tokAct, _ := NewActivation(tensor.Shape{1, hiddenSize}, tensor.DevicePlacement{Type: tensor.CUDA, GPU: gpu})
	defer FreeActivation(tokAct)
	a1, _ := NewActivation(tensor.Shape{1, cfg.IntermediateSize}, tensor.DevicePlacement{Type: tensor.CUDA, GPU: gpu})
	defer FreeActivation(a1)
	a3, _ := NewActivation(tensor.Shape{1, cfg.IntermediateSize}, tensor.DevicePlacement{Type: tensor.CUDA, GPU: gpu})
	defer FreeActivation(a3)
	exOut, _ := NewActivation(tensor.Shape{1, hiddenSize}, tensor.DevicePlacement{Type: tensor.CUDA, GPU: gpu})
	defer FreeActivation(exOut)
	sharedGateBuf, _ := NewActivation(tensor.Shape{1, cfg.IntermediateSize}, tensor.DevicePlacement{Type: tensor.CUDA, GPU: gpu})
	defer FreeActivation(sharedGateBuf)
	sharedUpBuf, _ := NewActivation(tensor.Shape{1, cfg.IntermediateSize}, tensor.DevicePlacement{Type: tensor.CUDA, GPU: gpu})
	defer FreeActivation(sharedUpBuf)
	sharedOutBuf, _ := NewActivation(tensor.Shape{1, hiddenSize}, tensor.DevicePlacement{Type: tensor.CUDA, GPU: gpu})
	defer FreeActivation(sharedOutBuf)

	// Temporary CPU tensor used to stage each token's data for upload.
	tokCPU := cpu.NewTensor(tensor.Shape{1, hiddenSize}, nil)
	for t := 0; t < seqLen; t++ {
		// Router scores for token t (already activated on the GPU above).
		row := gateData[t*cfg.NumExperts : (t+1)*cfg.NumExperts]
		scores := make([]float32, cfg.NumExperts)
		copy(scores, row)

		// Selection uses bias-adjusted scores; the mixing weights come
		// from the unbiased scores.
		scoresForChoice := make([]float32, cfg.NumExperts)
		for i := 0; i < cfg.NumExperts; i++ {
			scoresForChoice[i] = scores[i] + biasData[i]
		}
		masked := scoresForChoice
		if cfg.UseGroupedTopK {
			masked = nn.GroupedTopKMask(scoresForChoice, cfg.NumExpertGroup, cfg.TopKGroup)
		}
		choices := nn.SelectTopKExperts(masked, cfg.TopK, scores)
		if cfg.Renormalize && len(choices) > 1 {
			nn.RenormalizeMoEWeights(choices)
		}
		nn.ScaleMoEWeights(choices, cfg.RoutedScalingFactor)

		// Copy the token's hidden state to the GPU (reusing tokAct).
		copy(tokCPU.DataFloat32(), inData[t*hiddenSize:(t+1)*hiddenSize])
		tokCUDA, err := tokAct.AsCUDA(gpu)
		if err != nil {
			return err
		}
		tokCUDA.CopyFrom(tokCPU.DataFloat32())
		// Accumulate the weighted expert outputs for this token.
		acc := outCPU[t*hiddenSize : (t+1)*hiddenSize]
		for _, ch := range choices {
			ex := ch.Idx
			if ex >= len(weights.W1) || ex >= len(weights.W2) || ex >= len(weights.W3) {
				continue
			}
			w1 := weights.W1[ex]
			w2 := weights.W2[ex]
			w3 := weights.W3[ex]
			if w1 == nil || w2 == nil || w3 == nil {
				continue
			}
			// Expert MLP using the pre-allocated buffers.
			HybridLinear(ctx, tokAct, w1, a1)
			HybridLinear(ctx, tokAct, w3, a3)
			// SwiGLU on the GPU: a1 = silu(a1) * a3.
			HybridSiLU(ctx, a1)
			HybridMul(ctx, a1, a3)
			HybridLinear(ctx, a1, w2, exOut)
			exCPU, _ := exOut.AsCPU()
			wd := ch.Weight
			for j := 0; j < hiddenSize; j++ {
				acc[j] += wd * exCPU.DataFloat32()[j]
			}
		}
		// Shared experts (always active, added on top of the routed experts).
		if cfg.NumSharedExperts > 0 && weights.SharedGate != nil && weights.SharedUp != nil && weights.SharedDown != nil {
			HybridLinear(ctx, tokAct, weights.SharedGate, sharedGateBuf)
			HybridLinear(ctx, tokAct, weights.SharedUp, sharedUpBuf)
			// SwiGLU on the GPU.
			HybridSiLU(ctx, sharedGateBuf)
			HybridMul(ctx, sharedGateBuf, sharedUpBuf)
			HybridLinear(ctx, sharedGateBuf, weights.SharedDown, sharedOutBuf)
			sCPU, _ := sharedOutBuf.AsCPU()
			for j := 0; j < hiddenSize; j++ {
				acc[j] += sCPU.DataFloat32()[j]
			}
		}
	}

	// Upload the accumulated result to the GPU.
	outCUDA.CopyFrom(outCPU)
	return nil
}
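// hybridMoECPU computes the full MoE layer on the CPU: router matmul,
// activation, top-k expert selection, per-expert SwiGLU MLPs, and the
// optional shared experts. The accumulated result is copied back into
// out, which may live on the GPU.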
func hybridMoECPU(
	ctx *Context,
	hidden *Activation,
	weights *MoEWeights,
	cfg MoEConfig,
	out *Activation,
) error {
	seqLen := hidden.Shape()[0]
	hiddenSize := hidden.Shape()[1]

	// Router: hidden -> gate scores.
	gateAct, err := NewActivation(tensor.Shape{seqLen, cfg.NumExperts}, tensor.DevicePlacement{Type: tensor.CPU, GPU: -1})
	if err != nil {
		return err
	}
	if err := HybridLinear(ctx, hidden, weights.GateW, gateAct); err != nil {
		return err
	}
	gateCPU, _ := gateAct.AsCPU()
	biasCPU, ok := weights.GateBias.(*cpu.Tensor)
	if !ok {
		return fmt.Errorf("moe gate bias not cpu tensor")
	}
	outCPU, _ := out.AsCPU()
	outData := outCPU.DataFloat32()
	hiddenCPU, _ := hidden.AsCPU()
	inData := hiddenCPU.DataFloat32()

	// Zero the output buffer; expert outputs are accumulated into it.
	for i := range outData {
		outData[i] = 0
	}
	for t := 0; t < seqLen; t++ {
		row := gateCPU.DataFloat32()[t*cfg.NumExperts : (t+1)*cfg.NumExperts]

		// Apply the router activation on the CPU (the gate logits are
		// still raw here, unlike in the GPU path).
		scores := make([]float32, cfg.NumExperts)
		for i := 0; i < cfg.NumExperts; i++ {
			if cfg.RouterActivationFunc == "softmax" {
				scores[i] = row[i]
			} else {
				scores[i] = nn.Sigmoid(row[i])
			}
		}
		if cfg.RouterActivationFunc == "softmax" {
			nn.SoftmaxInplaceSimple(scores)
		}

		// Selection uses bias-adjusted scores; the mixing weights come
		// from the activated scores.
		scoresForChoice := make([]float32, cfg.NumExperts)
		for i := 0; i < cfg.NumExperts; i++ {
			scoresForChoice[i] = scores[i] + biasCPU.DataFloat32()[i]
		}
		masked := scoresForChoice
		if cfg.UseGroupedTopK {
			masked = nn.GroupedTopKMask(scoresForChoice, cfg.NumExpertGroup, cfg.TopKGroup)
		}
		choices := nn.SelectTopKExperts(masked, cfg.TopK, scores)
		if cfg.Renormalize && len(choices) > 1 {
			nn.RenormalizeMoEWeights(choices)
		}
		nn.ScaleMoEWeights(choices, cfg.RoutedScalingFactor)

		// Stage the token's hidden state as a 1 x hiddenSize activation.
		tok := cpu.NewTensor(tensor.Shape{1, hiddenSize}, nil)
		copy(tok.DataFloat32(), inData[t*hiddenSize:(t+1)*hiddenSize])
		tokAct := NewActivationFrom(tok)
		acc := outData[t*hiddenSize : (t+1)*hiddenSize]
		for _, ch := range choices {
			ex := ch.Idx
			if ex >= len(weights.W1) || ex >= len(weights.W2) || ex >= len(weights.W3) {
				continue
			}
			w1 := weights.W1[ex]
			w2 := weights.W2[ex]
			w3 := weights.W3[ex]
			if w1 == nil || w2 == nil || w3 == nil {
				continue
			}
			a1, _ := NewActivation(tensor.Shape{1, cfg.IntermediateSize}, tensor.DevicePlacement{Type: tensor.CPU, GPU: -1})
			a3, _ := NewActivation(tensor.Shape{1, cfg.IntermediateSize}, tensor.DevicePlacement{Type: tensor.CPU, GPU: -1})
			HybridLinear(ctx, tokAct, w1, a1)
			HybridLinear(ctx, tokAct, w3, a3)
			a1CPU, _ := a1.AsCPU()
			a3CPU, _ := a3.AsCPU()
			a1d := a1CPU.DataFloat32()
			a3d := a3CPU.DataFloat32()
			// SwiGLU on the CPU: a1 = silu(a1) * a3.
			for j := 0; j < cfg.IntermediateSize; j++ {
				a1d[j] = a1d[j] * nn.Sigmoid(a1d[j])
				a1d[j] = a1d[j] * a3d[j]
			}
			mix := NewActivationFrom(a1CPU)
			exOut, _ := NewActivation(tensor.Shape{1, hiddenSize}, tensor.DevicePlacement{Type: tensor.CPU, GPU: -1})
			HybridLinear(ctx, mix, w2, exOut)
			exCPU, _ := exOut.AsCPU()
			wd := ch.Weight
			for j := 0; j < hiddenSize; j++ {
				acc[j] += wd * exCPU.DataFloat32()[j]
			}
		}
		if cfg.NumSharedExperts > 0 && weights.SharedGate != nil && weights.SharedUp != nil && weights.SharedDown != nil {
			sharedGate, _ := NewActivation(tensor.Shape{1, cfg.IntermediateSize}, tensor.DevicePlacement{Type: tensor.CPU, GPU: -1})
			sharedUp, _ := NewActivation(tensor.Shape{1, cfg.IntermediateSize}, tensor.DevicePlacement{Type: tensor.CPU, GPU: -1})
			HybridLinear(ctx, tokAct, weights.SharedGate, sharedGate)
			HybridLinear(ctx, tokAct, weights.SharedUp, sharedUp)
			gCPU, _ := sharedGate.AsCPU()
			uCPU, _ := sharedUp.AsCPU()
			gD := gCPU.DataFloat32()
			uD := uCPU.DataFloat32()
			// SwiGLU on the CPU for the shared experts.
			for j := 0; j < len(gD) && j < len(uD); j++ {
				gD[j] = gD[j] * nn.Sigmoid(gD[j])
				gD[j] = gD[j] * uD[j]
			}
			mix := NewActivationFrom(gCPU)
			sharedOut, _ := NewActivation(tensor.Shape{1, hiddenSize}, tensor.DevicePlacement{Type: tensor.CPU, GPU: -1})
			HybridLinear(ctx, mix, weights.SharedDown, sharedOut)
			sCPU, _ := sharedOut.AsCPU()
			for j := 0; j < hiddenSize; j++ {
				acc[j] += sCPU.DataFloat32()[j]
			}
		}
	}

	// Important: copy the result back to the output activation (which may
	// live on the GPU).
	return out.CopyFrom(outCPU)
}