
engine.go

package engine

import (
	"context"
	"fmt"
	"log"
	"math"
	"sort"
	"strings"

	"makarna/pkg/backend/cpu"
	"makarna/pkg/backend/device"
	"makarna/pkg/compute"
	"makarna/pkg/loader"
	"makarna/pkg/model"
	_ "makarna/pkg/model/models"
	"makarna/pkg/tensor"
)

// Config configures the inference engine.
type Config struct {
	// GPULayers specifies how many layers to place on GPU (-1 = auto, 0 = CPU only).
	GPULayers int

	// GPUBudget is the fraction of GPU memory to use (0.0-1.0, default 0.9).
	GPUBudget float64

	// GPUDevices lists GPU ordinals to use.
	// In both auto mode (GPULayers=-1) and explicit mode (GPULayers>0), a nil or
	// empty list means "use all visible GPUs".
	GPUDevices []int

	// UseMmap controls whether the model file is memory-mapped.
	// Default is false (load tensors into RAM).
	UseMmap bool

	// CPUMoE keeps MoE expert weights on CPU to save GPU memory.
	// Expert computations are done by uploading activations to the GPU per expert.
	// Similar to llama.cpp's --moe-cpu flag.
	CPUMoE bool
}

// DefaultConfig returns a sensible default configuration.
func DefaultConfig() Config {
	return Config{
		GPULayers:  -1, // auto
		GPUBudget:  0.9,
		GPUDevices: nil,
		UseMmap:    false,
	}
}
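
// A minimal usage sketch for the API in this file (illustrative only; the
// model path, input tensors, and KV cache construction below are placeholders
// that depend on code outside this file):
//
//	cfg := engine.DefaultConfig()
//	cfg.GPULayers = 0 // force CPU-only placement
//	eng, err := engine.Load("path/to/model", cfg)
//	if err != nil {
//		log.Fatal(err)
//	}
//	defer eng.Close()
//	logits, err := eng.Forward(context.Background(), inputTokens, positions, kvCache)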

// Engine manages model execution with automatic device placement.
type Engine struct {
	model      model.Model
	loader     *loader.ModelData
	dispatcher *device.DeviceDispatcher
	config     Config
}

// Load loads a model with the given configuration.
// Device placement is determined automatically based on the config and available resources.
func Load(path string, cfg Config) (*Engine, error) {
	// 1. Load the model file.
	md, err := loader.LoadWithOptions(path, loader.LoadOptions{UseMmap: cfg.UseMmap})
	if err != nil {
		return nil, fmt.Errorf("failed to load model file: %w", err)
	}

	// 2. Parse the model config.
	modelCfg, err := parseModelConfig(md)
	if err != nil {
		md.Close()
		return nil, err
	}

	// 3. Determine device placements.
	placements := determinePlacements(md, &md.Metadata.ModelConfig, cfg)
	dispatcher := device.NewDeviceDispatcher(placements)

	// Log placement info.
	gpuCount := dispatcher.NumGPULayers()
	cpuCount := len(placements) - gpuCount
	if gpuCount > 0 {
		log.Printf("Device placement: %d layers on GPU, %d layers on CPU", gpuCount, cpuCount)
	} else {
		log.Printf("Device placement: all %d layers on CPU", len(placements))
	}

	// 4. Create the model.
	mod, err := model.New(modelCfg.Architecture, modelCfg)
	if err != nil {
		md.Close()
		return nil, err
	}

	// 5. Load weights (currently all on CPU, lazily transferred to GPU).
	if err := loadWeights(md, mod, dispatcher); err != nil {
		md.Close()
		return nil, err
	}
	if v, ok := mod.(interface{ Validate() error }); ok {
		if err := v.Validate(); err != nil {
			md.Close()
			return nil, err
		}
	}

	return &Engine{
		model:      mod,
		loader:     md,
		dispatcher: dispatcher,
		config:     cfg,
	}, nil
}

// Forward performs a forward pass through the model.
// Device placement is handled transparently: the engine ensures tensors
// are on the correct device for each layer automatically.
func (e *Engine) Forward(ctx interface{}, input tensor.Tensor, positions tensor.Tensor, kvCache model.KVCache) (tensor.Tensor, error) {
	// Create a context carrying the dispatcher for device-aware operations.
	goCtx := context.Background()
	if c, ok := ctx.(context.Context); ok {
		goCtx = c
	}
	goCtx = compute.WithDispatcher(goCtx, e.dispatcher)
	goCtx = compute.WithCPUMoE(goCtx, e.config.CPUMoE)
	return e.model.Forward(goCtx, input, positions, kvCache)
}
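
// ForwardBatch is the multi-sequence variant of Forward: it takes one KV cache
// per sequence in the batch and requires the model to implement
// model.BatchForwarder.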
func (e *Engine) ForwardBatch(ctx interface{}, input tensor.Tensor, positions tensor.Tensor, kvCaches []model.KVCache) (tensor.Tensor, error) {
	// Create a context carrying the dispatcher for device-aware operations.
	goCtx := context.Background()
	if c, ok := ctx.(context.Context); ok {
		goCtx = c
	}
	goCtx = compute.WithDispatcher(goCtx, e.dispatcher)
	goCtx = compute.WithCPUMoE(goCtx, e.config.CPUMoE)
	if m, ok := e.model.(model.BatchForwarder); ok {
		return m.ForwardBatch(goCtx, input, positions, kvCaches)
	}
	return nil, fmt.Errorf("model does not support ForwardBatch")
}

// Model returns the underlying model.
func (e *Engine) Model() model.Model {
	return e.model
}

// Loader returns the model data loader.
func (e *Engine) Loader() *loader.ModelData {
	return e.loader
}

// Dispatcher returns the device dispatcher for layer placement queries.
func (e *Engine) Dispatcher() *device.DeviceDispatcher {
	return e.dispatcher
}

// ComputeContext creates a compute context for a specific layer.
func (e *Engine) ComputeContext(layerIdx int) *compute.Context {
	return compute.NewContext(e.dispatcher, layerIdx)
}

// Close releases all resources.
func (e *Engine) Close() error {
	if e.dispatcher != nil {
		e.dispatcher.Clear()
	}
	if e.loader != nil {
		return e.loader.Close()
	}
	return nil
}
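
// parseModelConfig maps the loader's raw parameter table onto model.Config.
// For example (values hypothetical), params of
//
//	{"hidden_size": 4096, "num_attention_heads": 32, "num_hidden_layers": 32}
//
// yield HiddenSize=4096, NumHeads=32, NumLayers=32, with NumKVHeads defaulting
// to NumHeads, HeadDim to HiddenSize/NumHeads (128 here), RopeTheta to 10000,
// and RMSNormEps to 1e-6 when those keys are absent.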
func parseModelConfig(md *loader.ModelData) (*model.Config, error) {
	params := md.Metadata.ModelConfig.Params

	hiddenSize, _ := params["hidden_size"].(float64)
	numLayers, _ := params["num_hidden_layers"].(float64)
	numHeads, _ := params["num_attention_heads"].(float64)
	vocabSize, _ := params["vocab_size"].(float64)
	intermediate, _ := params["intermediate_size"].(float64)

	numKVHeads, _ := params["num_key_value_heads"].(float64)
	if numKVHeads == 0 {
		numKVHeads = numHeads
	}
	headDim, _ := params["head_dim"].(float64)
	if headDim == 0 {
		headDim = hiddenSize / numHeads
	}
	ropeTheta, _ := params["rope_theta"].(float64)
	if ropeTheta == 0 {
		ropeTheta = 10000.0
	}
	rmsNormEps, _ := params["rms_norm_eps"].(float64)
	if rmsNormEps == 0 {
		rmsNormEps = 1e-6
	}

	return &model.Config{
		Architecture: md.Metadata.ModelConfig.Architecture,
		HiddenSize:   int(hiddenSize),
		NumLayers:    int(numLayers),
		NumHeads:     int(numHeads),
		NumKVHeads:   int(numKVHeads),
		VocabSize:    int(vocabSize),
		Intermediate: int(intermediate),
		HeadDim:      int(headDim),
		RopeTheta:    ropeTheta,
		RMSNormEps:   rmsNormEps,
		Params:       params,
	}, nil
}

func determinePlacements(md *loader.ModelData, loaderCfg *loader.ModelConfig, cfg Config) []tensor.DevicePlacement {
	numLayers := int(loaderCfg.Params["num_hidden_layers"].(float64))

	// CPU-only mode.
	if cfg.GPULayers == 0 || !device.CUDAAvailable() {
		return makeCPUPlacements(numLayers)
	}

	// Explicit GPU layer count.
	if cfg.GPULayers > 0 {
		return makeExplicitPlacementsFitBudget(md, loaderCfg, numLayers, cfg.GPULayers, cfg.GPUBudget, cfg.GPUDevices)
	}

	// Auto mode: fit layers to the VRAM budget.
	return PlanLayerDevices(md, loaderCfg, cfg.GPUBudget, cfg.GPUDevices)
}
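
// makeExplicitPlacements pins the last gpuLayers layers of the stack to the
// listed GPUs, splitting them into contiguous chunks, and leaves earlier
// layers on CPU. As a worked example (layer counts chosen for illustration):
// with numLayers=32, gpuLayers=10 and gpuDevices=[0,1], layers 0-21 stay on
// CPU, layers 22-26 go to GPU 0 and layers 27-31 go to GPU 1; with
// gpuLayers=11 the extra layer goes to GPU 0 (layers 21-26) while GPU 1 keeps
// five (27-31).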
func makeExplicitPlacements(numLayers, gpuLayers int, gpuDevices []int) []tensor.DevicePlacement {
	placements := make([]tensor.DevicePlacement, numLayers)
	if gpuLayers > numLayers {
		gpuLayers = numLayers
	}
	if len(gpuDevices) == 0 {
		gpuDevices = normalizedAutoGPUList(nil)
		if len(gpuDevices) == 0 {
			gpuDevices = []int{0}
		}
	}
	for i := range placements {
		placements[i] = tensor.DevicePlacement{Type: tensor.CPU, GPU: -1}
	}

	perDev := gpuLayers / len(gpuDevices)
	extra := gpuLayers % len(gpuDevices)
	start := numLayers - gpuLayers
	layer := start
	for di, dev := range gpuDevices {
		take := perDev
		if di < extra {
			take++
		}
		for j := 0; j < take && layer < numLayers; j++ {
			placements[layer] = tensor.DevicePlacement{Type: tensor.CUDA, GPU: dev}
			layer++
		}
	}
	return placements
}
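
// makeExplicitPlacementsFitBudget honors an explicit gpu_layers request but
// shrinks it until the per-GPU VRAM budgets fit. Each GPU's budget is
// min(total*budgetFraction, free) minus defaultGPUReserveBytes. As a worked
// example with made-up numbers: on a 24 GiB card with 20 GiB free and
// budgetFraction=0.9, the target is min(21.6, 20) = 20 GiB minus the reserve;
// per-layer weight+cache estimates are then summed per GPU and the requested
// layer count is decremented until every GPU stays under its budget.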
func makeExplicitPlacementsFitBudget(md *loader.ModelData, loaderCfg *loader.ModelConfig, numLayers, gpuLayers int, budgetFraction float64, gpuDevices []int) []tensor.DevicePlacement {
	if gpuLayers <= 0 {
		return makeCPUPlacements(numLayers)
	}
	if budgetFraction <= 0 || budgetFraction > 1 {
		budgetFraction = 0.9
	}

	// If we can't query budgets, keep the legacy behavior (may OOM, but avoids breaking CUDA-less builds).
	gpuList := normalizedAutoGPUList(gpuDevices)
	if len(gpuList) == 0 {
		gpuList = []int{0}
	}
	sort.Ints(gpuList)

	budgets := make(map[int]uint64, len(gpuList))
	for _, gpu := range gpuList {
		totalMem, freeMem, err := cudaMemoryInfoDeviceFn(gpu)
		if err != nil {
			continue
		}
		target := uint64(float64(totalMem) * budgetFraction)
		if target > freeMem {
			target = freeMem
		}
		if target <= defaultGPUReserveBytes {
			continue
		}
		target -= defaultGPUReserveBytes
		budgets[gpu] = target
	}
	if len(budgets) == 0 {
		return makeExplicitPlacements(numLayers, gpuLayers, gpuList)
	}

	layerWeights := estimateLayerWeightBytes(md, numLayers)
	layerCache := estimateLayerCacheBytes(loaderCfg, numLayers)
	layerNeed := make([]uint64, numLayers)
	for i := 0; i < numLayers; i++ {
		layerNeed[i] = layerWeights[i] + layerCache[i]
	}

	// Try from the requested count down to 1 until it fits.
	requested := gpuLayers
	for gpuLayers > 0 {
		placements := makeExplicitPlacements(numLayers, gpuLayers, gpuList)
		used := make(map[int]uint64, len(budgets))
		ok := true
		for layer, p := range placements {
			pp := p.Normalize()
			if pp.Type != tensor.CUDA || pp.GPU < 0 {
				continue
			}
			b, okBudget := budgets[pp.GPU]
			if !okBudget || b == 0 {
				ok = false
				break
			}
			used[pp.GPU] += layerNeed[layer]
			if used[pp.GPU] > b {
				ok = false
				break
			}
		}
		if ok {
			if gpuLayers < requested {
				log.Printf("placement: reducing explicit gpu_layers %d -> %d (budget=%.2f gpus=%v)", requested, gpuLayers, budgetFraction, gpuList)
			}
			return placements
		}
		gpuLayers--
	}
	log.Printf("placement: explicit gpu_layers=%d does not fit, falling back to CPU", requested)
	return makeCPUPlacements(numLayers)
}
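
// loadWeights streams tensors from the model file into the model in file
// order, deciding per tensor whether to keep the on-disk precision or decode
// to float32 on CPU. Illustrative examples (tensor names hypothetical): a 2-D
// "model.layers.30.mlp.down_proj.weight" placed on a GPU layer keeps F16,
// while "model.embed_tokens.weight" keeps F16 only if a separate lm_head
// exists or the last layer runs on GPU; BF16 tensors are always decoded to
// float32 for now.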
func loadWeights(md *loader.ModelData, mod model.Model, dispatcher *device.DeviceDispatcher) error {
	type namedEntry struct {
		name string
		info loader.TensorEntry
	}
	ordered := make([]namedEntry, 0, len(md.Metadata.Tensors))
	for name, info := range md.Metadata.Tensors {
		ordered = append(ordered, namedEntry{name: name, info: info})
	}
	sort.Slice(ordered, func(i, j int) bool {
		return ordered[i].info.Offset < ordered[j].info.Offset
	})

	modelCfg := mod.Config()
	numLayers := 0
	lastOnGPU := false
	if modelCfg != nil && modelCfg.NumLayers > 0 {
		numLayers = modelCfg.NumLayers
		if dispatcher != nil {
			p := dispatcher.LayerPlacement(numLayers - 1).Normalize()
			lastOnGPU = p.Type == tensor.CUDA
		}
	}

	hasExplicitLMHead := false
	for n := range md.Metadata.Tensors {
		if strings.Contains(strings.ToLower(n), "lm_head") {
			hasExplicitLMHead = true
			break
		}
	}

	keepF16 := func(name string, shape tensor.Shape, onGPU bool) bool {
		if len(shape) != 2 {
			return false
		}
		ln := strings.ToLower(name)
		if strings.Contains(ln, "conv") || strings.Contains(ln, "a_log") || strings.Contains(ln, "dt_bias") || strings.Contains(ln, "o_norm") {
			return false
		}
		// The embedding is on CPU but we can keep fp16 (nn.Embedding handles it).
		if strings.Contains(ln, "embed_tokens") {
			// If lm_head is tied to the embeddings and runs on CPU, we must keep F32.
			return hasExplicitLMHead || lastOnGPU
		}
		// Dense matmul weights for GPU layers can stay fp16 to avoid a 2x RAM blow-up.
		return onGPU
	}

	for _, ent := range ordered {
		name := ent.name
		info := ent.info

		shape := make(tensor.Shape, len(info.Shape))
		for i, dim := range info.Shape {
			shape[i] = int(dim)
		}

		var dt tensor.DType
		decodeToF32 := false
		switch info.DType {
		case loader.F32:
			dt = tensor.Float32
		case loader.F16:
			dt = tensor.Float16
		case loader.BF16:
			dt = tensor.BFloat16
			decodeToF32 = true // keep conservative until BF16 matmul is supported
		case loader.Q4_K:
			dt = tensor.Q4_K
		case loader.Q3_K:
			dt = tensor.Q3_K
		case loader.Q5_K:
			dt = tensor.Q5_K
		case loader.Q6_K:
			dt = tensor.Q6_K
		case loader.Q8_K:
			dt = tensor.Q8_K
		case loader.Q2_K:
			dt = tensor.Q2_K
		default:
			dt = tensor.Float32
		}

		onGPU := false
		if dispatcher != nil && numLayers > 0 {
			if idx, ok := parseLayerIndex(name); ok && idx >= 0 && idx < numLayers {
				p := dispatcher.LayerPlacement(idx).Normalize()
				onGPU = p.Type == tensor.CUDA
			} else {
				ln := strings.ToLower(name)
				if strings.Contains(ln, "lm_head") || strings.Contains(ln, "output") || strings.Contains(ln, "model.norm") {
					onGPU = lastOnGPU
				}
			}
		}

		// Keep fp16 weights for GPU matmuls to avoid the 2x RAM cost.
		if dt == tensor.Float16 {
			decodeToF32 = !keepF16(name, shape, onGPU)
		}

		tBytes, err := md.GetTensorData(name)
		if err != nil {
			return err
		}
		t, err := cpu.NewTensorFromBytes(shape, dt, tBytes)
		if err != nil {
			return fmt.Errorf("failed to create tensor %s: %v", name, err)
		}
		if decodeToF32 {
			u16 := t.DataUint16()
			out := make([]float32, shape.NumElements())
			switch dt {
			case tensor.Float16:
				for i := range out {
					out[i] = float16BitsToFloat32(u16[i])
				}
			case tensor.BFloat16:
				for i := range out {
					out[i] = bfloat16BitsToFloat32(u16[i])
				}
			}
			t = cpu.NewTensor(shape, out)
		}

		// Tensors the model does not recognize are simply skipped.
		_ = mod.SetTensor(name, t)
	}
	return nil
}
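
// float16BitsToFloat32 converts an IEEE 754 half-precision bit pattern to
// float32, handling subnormals, infinities and NaN explicitly. Worked
// examples: 0x3C00 (sign 0, exponent 15, mantissa 0) maps to float32 bits
// 0x3F800000, i.e. 1.0; the smallest subnormal 0x0001 normalizes to 2^-24
// (about 5.96e-8); 0x7C00 maps to +Inf. bfloat16BitsToFloat32 below simply
// places the 16 bits in the high half of the float32 word, so 0x3F80 is
// likewise 1.0.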
func float16BitsToFloat32(bits uint16) float32 {
	sign := uint32(bits&0x8000) << 16
	exp := int32((bits & 0x7C00) >> 10)
	mant := uint32(bits & 0x03FF)

	if exp == 0 {
		if mant == 0 {
			return math.Float32frombits(sign) // signed zero
		}
		// Subnormal: renormalize the mantissa.
		for mant&0x0400 == 0 {
			mant <<= 1
			exp--
		}
		exp++
		mant &= 0x03FF
	} else if exp == 0x1F {
		if mant == 0 {
			return math.Float32frombits(sign | 0x7F800000) // +/-Inf
		}
		return math.Float32frombits(sign | 0x7FC00000) // NaN
	}

	// Rebias the exponent from half (15) to single (127) precision.
	exp = exp + (127 - 15)
	return math.Float32frombits(sign | (uint32(exp) << 23) | (mant << 13))
}

func bfloat16BitsToFloat32(bits uint16) float32 {
	return math.Float32frombits(uint32(bits) << 16)
}

// DeviceConfig is kept for legacy compatibility.
type DeviceConfig struct {
	LayerDevices []tensor.DevicePlacement
}

// LoadModel converts a legacy DeviceConfig into the new Config and calls Load.
func LoadModel(path string, config DeviceConfig) (*Engine, error) {
	cfg := DefaultConfig()
	if len(config.LayerDevices) > 0 {
		// Count GPU layers.
		gpuCount := 0
		for _, p := range config.LayerDevices {
			if p.Type == tensor.CUDA {
				gpuCount++
			}
		}
		cfg.GPULayers = gpuCount
	}
	return Load(path, cfg)
}