placement.go

package engine

import (
	"log"
	"sort"
	"strconv"
	"strings"

	"makarna/pkg/backend/cuda"
	"makarna/pkg/loader"
	"makarna/pkg/tensor"
)
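// The CUDA queries go through package-level vars so they can be swapped out (e.g. in tests).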
var (
	cudaDeviceCountFn      = cuda.DeviceCount
	cudaMemoryInfoDeviceFn = cuda.MemoryInfoDevice
)

const defaultGPUReserveBytes = uint64(1024 << 20) // 1 GiB headroom for scratch/overheads
// PlanLayerDevices decides per-layer placement based on a GPU budget fraction.
// If GPU info is unavailable, defaults to CPU.
func PlanLayerDevices(md *loader.ModelData, cfg *loader.ModelConfig, budgetFraction float64, gpuDevices []int) []tensor.DevicePlacement {
	if budgetFraction <= 0 || budgetFraction > 1 {
		budgetFraction = 0.9
	}
	// Use intFromAny so missing or oddly typed metadata falls through to the <= 0 check below
	// instead of panicking on a failed type assertion.
	numLayers := intFromAny(cfg.Params["num_hidden_layers"], 0)
	if numLayers <= 0 {
		return nil
	}
	gpus := normalizedAutoGPUList(gpuDevices)
	if len(gpus) == 0 {
		log.Printf("placement: no usable GPUs, defaulting to CPU")
		return makeCPUPlacements(numLayers)
	}
	budgets := make(map[int]uint64, len(gpus))
	var totalBudget uint64
	for _, gpu := range gpus {
		totalMem, freeMem, err := cudaMemoryInfoDeviceFn(gpu)
		if err != nil {
			log.Printf("placement: cuda mem info gpu=%d unavailable (%v), skipping", gpu, err)
			continue
		}
		target := uint64(float64(totalMem) * budgetFraction)
		if target > freeMem {
			target = freeMem
		}
		if target <= defaultGPUReserveBytes {
			log.Printf("placement: gpu=%d budget too small (%d bytes), skipping", gpu, target)
			continue
		}
		target -= defaultGPUReserveBytes
		budgets[gpu] = target
		totalBudget += target
	}
	if totalBudget == 0 {
		log.Printf("placement: no usable GPU budgets, defaulting to CPU")
		return makeCPUPlacements(numLayers)
	}
	layerWeights := estimateLayerWeightBytes(md, numLayers)
	layerCache := estimateLayerCacheBytes(cfg, numLayers)
	layerNeed := make([]uint64, numLayers)
	var totalNeed uint64
	for i := 0; i < numLayers; i++ {
		layerNeed[i] = layerWeights[i] + layerCache[i]
		totalNeed += layerNeed[i]
	}
	placements := makeCPUPlacements(numLayers)
	if totalNeed == 0 {
		return placements
	}
	// If everything fits in aggregate, distribute layers roughly proportionally to GPU budgets.
	if totalNeed <= totalBudget {
		// Build per-GPU target usage.
		targetByGPU := make(map[int]uint64, len(budgets))
		var ordered []int
		for _, gpu := range gpus {
			if b := budgets[gpu]; b > 0 {
				ordered = append(ordered, gpu)
			}
		}
		sort.Ints(ordered)
		var acc uint64
		for i, gpu := range ordered {
			b := budgets[gpu]
			share := uint64(float64(totalNeed) * (float64(b) / float64(totalBudget)))
			if i == len(ordered)-1 {
				share = totalNeed - acc
			}
			targetByGPU[gpu] = share
			acc += share
		}
		used := make(map[int]uint64, len(ordered))
		cur := 0
		for layer := numLayers - 1; layer >= 0; layer-- {
			need := layerNeed[layer]
			placed := false
			for cur < len(ordered) {
				gpu := ordered[cur]
				b := budgets[gpu]
				if b-used[gpu] < need {
					cur++
					continue
				}
				// Prefer moving to next GPU once we cross the proportional target, if next can fit.
				if cur < len(ordered)-1 && used[gpu]+need > targetByGPU[gpu] {
					next := ordered[cur+1]
					if budgets[next]-used[next] >= need {
						cur++
						continue
					}
				}
				placements[layer] = tensor.DevicePlacement{Type: tensor.CUDA, GPU: gpu}
				used[gpu] += need
				placed = true
				break
			}
			if !placed {
				placements[layer] = tensor.DevicePlacement{Type: tensor.CPU, GPU: -1}
			}
		}
		return placements
	}
	// Otherwise: greedy fill GPUs in order, spill the rest to CPU.
	ordered := make([]int, 0, len(budgets))
	for _, gpu := range gpus {
		if budgets[gpu] > 0 {
			ordered = append(ordered, gpu)
		}
	}
	sort.Ints(ordered)
	used := make(map[int]uint64, len(ordered))
	cur := 0
	for layer := numLayers - 1; layer >= 0; layer-- {
		need := layerNeed[layer]
		placed := false
		for cur < len(ordered) {
			gpu := ordered[cur]
			if budgets[gpu]-used[gpu] >= need {
				placements[layer] = tensor.DevicePlacement{Type: tensor.CUDA, GPU: gpu}
				used[gpu] += need
				placed = true
				break
			}
			cur++
		}
		if !placed {
			placements[layer] = tensor.DevicePlacement{Type: tensor.CPU, GPU: -1}
		}
	}
	return placements
}
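// makeCPUPlacements returns n placements that all target the CPU.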
func makeCPUPlacements(n int) []tensor.DevicePlacement {
	p := make([]tensor.DevicePlacement, n)
	for i := range p {
		p[i] = tensor.DevicePlacement{Type: tensor.CPU, GPU: -1}
	}
	return p
}
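// normalizedAutoGPUList deduplicates and sorts the requested GPU ids, dropping
// negatives; when none are given it falls back to enumerating all CUDA devices.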
func normalizedAutoGPUList(gpuDevices []int) []int {
	seen := make(map[int]bool)
	out := make([]int, 0, len(gpuDevices))
	for _, g := range gpuDevices {
		if g < 0 {
			continue
		}
		if !seen[g] {
			seen[g] = true
			out = append(out, g)
		}
	}
	if len(out) > 0 {
		sort.Ints(out)
		return out
	}
	// Auto-detect all devices when not explicitly configured.
	n, err := cudaDeviceCountFn()
	if err != nil || n <= 0 {
		return nil
	}
	out = make([]int, 0, n)
	for i := 0; i < n; i++ {
		out = append(out, i)
	}
	return out
}
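// estimateLayerWeightBytes approximates the per-layer weight footprint from the tensor
// metadata; tensors without a layer index are attributed to the first layer (embeddings)
// or the last layer (lm_head/output and other shared weights).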
func estimateLayerWeightBytes(md *loader.ModelData, numLayers int) []uint64 {
	out := make([]uint64, numLayers)
	if md == nil {
		return out
	}
	for name, t := range md.Metadata.Tensors {
		sz := effectiveTensorBytes(t)
		if idx, ok := parseLayerIndex(name); ok && idx >= 0 && idx < numLayers {
			out[idx] += sz
			continue
		}
		ln := strings.ToLower(name)
		switch {
		case strings.Contains(ln, "embed_tokens") || strings.Contains(ln, "token_emb"):
			out[0] += sz
		case strings.Contains(ln, "lm_head") || strings.Contains(ln, "output"):
			out[numLayers-1] += sz
		default:
			// Treat as "shared"; add to last layer for budgeting.
			out[numLayers-1] += sz
		}
	}
	return out
}
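// effectiveTensorBytes estimates how many bytes a tensor occupies at runtime.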
func effectiveTensorBytes(t loader.TensorEntry) uint64 {
	// Placement needs to approximate the *runtime* footprint; prefer exact on-disk size
	// when available, fall back to dtype-based estimates otherwise.
	if t.Size > 0 {
		return t.Size
	}
	var elems uint64 = 1
	for _, d := range t.Shape {
		if d == 0 {
			return 0
		}
		elems *= d
	}
	if elems == 0 {
		return 0
	}
	switch t.DType {
	case loader.F16, loader.BF16:
		return elems * 2
	case loader.F32:
		return elems * 4
	}
	// Fallback: assume fp16 weight cache for large 2D quant matrices (matmul weights).
	if t.DType.Info().BlockSize > 0 && len(t.Shape) == 2 && t.Shape[0] > 1 && t.Shape[1] > 1 {
		return elems * 2
	}
	// Fallback to metadata bytes-per-element.
	bpe := float64(t.DType.Info().BytesPerEl)
	if bpe <= 0 {
		return 0
	}
	return uint64(float64(elems) * bpe)
}
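// estimateLayerCacheBytes approximates per-layer cache memory: recurrent/conv state for
// KimiLinear-style layers, otherwise a full-length fp16 KV cache.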
func estimateLayerCacheBytes(cfg *loader.ModelConfig, numLayers int) []uint64 {
	out := make([]uint64, numLayers)
	if cfg == nil || cfg.Params == nil || numLayers <= 0 {
		return out
	}
	arch := strings.ToLower(cfg.Architecture)
	params := cfg.Params
	// KimiLinear: recurrent cache (per-layer constant state) + short conv states.
	if strings.Contains(arch, "kimi") {
		lac, _ := params["linear_attn_config"].(map[string]any)
		numHeads := intFromAny(lac["num_heads"], intFromAny(params["num_attention_heads"], 0))
		headDim := intFromAny(lac["head_dim"], 0)
		kernel := intFromAny(lac["short_conv_kernel_size"], 0)
		if numHeads <= 0 || headDim <= 0 {
			return out
		}
		if kernel <= 1 {
			kernel = 1
		}
		convLen := kernel - 1
		projSize := numHeads * headDim
		recurrentBytes := uint64(numHeads * headDim * headDim * 4)
		convBytes := uint64(projSize * convLen * 4)
		perLayer := recurrentBytes + 3*convBytes
		kdaLayers := parseIntListAny(lac["kda_layers"])
		if len(kdaLayers) == 0 {
			// Conservative: assume all layers use recurrent state.
			for i := range out {
				out[i] = perLayer
			}
			return out
		}
		for _, oneBased := range kdaLayers {
			idx := oneBased - 1
			if idx >= 0 && idx < len(out) {
				out[idx] = perLayer
			}
		}
		return out
	}
	// Default: paged KV cache with fp16 K/V (2 bytes per element).
	numHeads := intFromAny(params["num_attention_heads"], 0)
	numKVHeads := intFromAny(params["num_key_value_heads"], 0)
	if numKVHeads == 0 {
		numKVHeads = numHeads
	}
	hidden := intFromAny(params["hidden_size"], 0)
	headDim := intFromAny(params["head_dim"], 0)
	if headDim == 0 && numHeads > 0 {
		headDim = hidden / numHeads
	}
	maxSeq := intFromAny(params["max_position_embeddings"], 0)
	if maxSeq <= 0 {
		maxSeq = 4096
	}
	if numKVHeads <= 0 || headDim <= 0 {
		return out
	}
	kvDim := numKVHeads * headDim
	perLayer := uint64(maxSeq * kvDim * 4) // K+V fp16 => 2B + 2B = 4B
	for i := range out {
		out[i] = perLayer
	}
	return out
}
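// parseLayerIndex extracts the layer index from tensor names of the form
// "model.layers.<n>.*" or "layers.<n>.*".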
func parseLayerIndex(name string) (int, bool) {
	for _, prefix := range []string{"model.layers.", "layers."} {
		if !strings.HasPrefix(name, prefix) {
			continue
		}
		rest := name[len(prefix):]
		dot := strings.IndexByte(rest, '.')
		if dot <= 0 {
			return 0, false
		}
		n, err := strconv.Atoi(rest[:dot])
		if err != nil {
			return 0, false
		}
		return n, true
	}
	return 0, false
}
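// intFromAny converts common metadata encodings of an integer (int, float, numeric string)
// to int, returning def when the value is missing or not numeric.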
func intFromAny(v any, def int) int {
	switch x := v.(type) {
	case int:
		return x
	case int64:
		return int(x)
	case float64:
		return int(x)
	case float32:
		return int(x)
	case string:
		if n, err := strconv.Atoi(x); err == nil {
			return n
		}
	}
	return def
}
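// parseIntListAny converts a JSON-style list ([]any or []float64) into []int,
// keeping only int and float64 elements.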
func parseIntListAny(v any) []int {
	arr, ok := v.([]any)
	if !ok {
		if f, ok := v.([]float64); ok {
			out := make([]int, len(f))
			for i := range f {
				out[i] = int(f[i])
			}
			return out
		}
		return nil
	}
	out := make([]int, 0, len(arr))
	for _, it := range arr {
		switch x := it.(type) {
		case float64:
			out = append(out, int(x))
		case int:
			out = append(out, x)
		}
	}
	return out
}
// cudaMemInfo queries free/total GPU memory; it returns a non-nil error when CUDA is unavailable.
func cudaMemInfo() (total uint64, free uint64, err error) {
	return cudaMemoryInfo()
}