package engine

import (
	"context"
	"fmt"
	"log"
	"math"
	"sort"
	"strings"

	"makarna/pkg/backend/cpu"
	"makarna/pkg/backend/device"
	"makarna/pkg/compute"
	"makarna/pkg/loader"
	"makarna/pkg/model"
	_ "makarna/pkg/model/models" // imported for side effects
	"makarna/pkg/tensor"
)

// Config configures the inference engine.
type Config struct {
	// GPULayers specifies how many layers to place on GPU (-1 = auto, 0 = CPU only).
	GPULayers int

	// GPUBudget is the fraction of GPU memory to use (0.0-1.0, default 0.9).
	GPUBudget float64

	// GPUDevices lists GPU ordinals to use. In both auto mode (GPULayers=-1)
	// and explicit mode (GPULayers>0), nil/empty means "use all visible GPUs".
	GPUDevices []int

	// UseMmap controls whether the model file is memory-mapped.
	// Default is false (load tensors into RAM).
	UseMmap bool

	// CPUMoE keeps MoE expert weights on CPU to save GPU memory.
	// Expert computations are performed by uploading activations to the GPU per expert.
	// Similar to llama.cpp's --moe-cpu flag.
	CPUMoE bool
}

// DefaultConfig returns a sensible default configuration.
func DefaultConfig() Config {
	return Config{
		GPULayers:  -1, // auto
		GPUBudget:  0.9,
		GPUDevices: nil,
		UseMmap:    false,
	}
}

// Engine manages model execution with automatic device placement.
type Engine struct {
	model      model.Model
	loader     *loader.ModelData
	dispatcher *device.DeviceDispatcher
	config     Config
}

// Load loads a model with the given configuration.
// Device placement is determined automatically based on config and available resources.
func Load(path string, cfg Config) (*Engine, error) {
	// 1. Load model file
	md, err := loader.LoadWithOptions(path, loader.LoadOptions{UseMmap: cfg.UseMmap})
	if err != nil {
		return nil, fmt.Errorf("failed to load model file: %w", err)
	}

	// 2. Parse model config
	modelCfg, err := parseModelConfig(md)
	if err != nil {
		md.Close()
		return nil, err
	}

	// 3. Determine device placements
	placements := determinePlacements(md, &md.Metadata.ModelConfig, cfg)
	dispatcher := device.NewDeviceDispatcher(placements)

	// Log placement info
	gpuCount := dispatcher.NumGPULayers()
	cpuCount := len(placements) - gpuCount
	if gpuCount > 0 {
		log.Printf("Device placement: %d layers on GPU, %d layers on CPU", gpuCount, cpuCount)
	} else {
		log.Printf("Device placement: all %d layers on CPU", len(placements))
	}

	// 4. Create model
	mod, err := model.New(modelCfg.Architecture, modelCfg)
	if err != nil {
		md.Close()
		return nil, err
	}

	// 5. Load weights (currently all on CPU, lazy transfer to GPU)
	if err := loadWeights(md, mod, dispatcher); err != nil {
		md.Close()
		return nil, err
	}

	if v, ok := mod.(interface{ Validate() error }); ok {
		if err := v.Validate(); err != nil {
			md.Close()
			return nil, err
		}
	}

	return &Engine{
		model:      mod,
		loader:     md,
		dispatcher: dispatcher,
		config:     cfg,
	}, nil
}
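
// Example usage (illustrative sketch only; the model path, input/position
// tensors, and KV cache are placeholders, not values provided by this package):
//
//	eng, err := engine.Load("/path/to/model", engine.DefaultConfig())
//	if err != nil {
//		log.Fatal(err)
//	}
//	defer eng.Close()
//
//	logits, err := eng.Forward(context.Background(), input, positions, kvCache)
//	if err != nil {
//		log.Fatal(err)
//	}
//	_ = logits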

// Forward performs a forward pass through the model.
// Device placement is handled transparently - the engine ensures tensors
// are on the correct device for each layer automatically.
func (e *Engine) Forward(ctx interface{}, input tensor.Tensor, positions tensor.Tensor, kvCache model.KVCache) (tensor.Tensor, error) {
	// Create context with dispatcher for device-aware operations
	var goCtx context.Context
	if ctx != nil {
		if c, ok := ctx.(context.Context); ok {
			goCtx = c
		} else {
			goCtx = context.Background()
		}
	} else {
		goCtx = context.Background()
	}
	goCtx = compute.WithDispatcher(goCtx, e.dispatcher)
	goCtx = compute.WithCPUMoE(goCtx, e.config.CPUMoE)
	return e.model.Forward(goCtx, input, positions, kvCache)
}

// ForwardBatch performs a forward pass over a batch of sequences, one KV cache
// per sequence. It fails if the underlying model does not implement
// model.BatchForwarder.
func (e *Engine) ForwardBatch(ctx interface{}, input tensor.Tensor, positions tensor.Tensor, kvCaches []model.KVCache) (tensor.Tensor, error) {
	// Create context with dispatcher for device-aware operations
	var goCtx context.Context
	if ctx != nil {
		if c, ok := ctx.(context.Context); ok {
			goCtx = c
		} else {
			goCtx = context.Background()
		}
	} else {
		goCtx = context.Background()
	}
	goCtx = compute.WithDispatcher(goCtx, e.dispatcher)
	goCtx = compute.WithCPUMoE(goCtx, e.config.CPUMoE)
	if m, ok := e.model.(model.BatchForwarder); ok {
		return m.ForwardBatch(goCtx, input, positions, kvCaches)
	}
	return nil, fmt.Errorf("model does not support ForwardBatch")
}

// Model returns the underlying model.
func (e *Engine) Model() model.Model { return e.model }

// Loader returns the model data loader.
func (e *Engine) Loader() *loader.ModelData { return e.loader }

// Dispatcher returns the device dispatcher for layer placement queries.
func (e *Engine) Dispatcher() *device.DeviceDispatcher { return e.dispatcher }

// ComputeContext creates a compute context for a specific layer.
func (e *Engine) ComputeContext(layerIdx int) *compute.Context {
	return compute.NewContext(e.dispatcher, layerIdx)
}

// Close releases all resources.
func (e *Engine) Close() error {
	if e.dispatcher != nil {
		e.dispatcher.Clear()
	}
	if e.loader != nil {
		return e.loader.Close()
	}
	return nil
}

func parseModelConfig(md *loader.ModelData) (*model.Config, error) {
	params := md.Metadata.ModelConfig.Params

	hiddenSize, _ := params["hidden_size"].(float64)
	numLayers, _ := params["num_hidden_layers"].(float64)
	numHeads, _ := params["num_attention_heads"].(float64)
	vocabSize, _ := params["vocab_size"].(float64)
	intermediate, _ := params["intermediate_size"].(float64)

	numKVHeads, _ := params["num_key_value_heads"].(float64)
	if numKVHeads == 0 {
		numKVHeads = numHeads
	}
	headDim, _ := params["head_dim"].(float64)
	if headDim == 0 {
		headDim = hiddenSize / numHeads
	}
	ropeTheta, _ := params["rope_theta"].(float64)
	if ropeTheta == 0 {
		ropeTheta = 10000.0
	}
	rmsNormEps, _ := params["rms_norm_eps"].(float64)
	if rmsNormEps == 0 {
		rmsNormEps = 1e-6
	}

	return &model.Config{
		Architecture: md.Metadata.ModelConfig.Architecture,
		HiddenSize:   int(hiddenSize),
		NumLayers:    int(numLayers),
		NumHeads:     int(numHeads),
		NumKVHeads:   int(numKVHeads),
		VocabSize:    int(vocabSize),
		Intermediate: int(intermediate),
		HeadDim:      int(headDim),
		RopeTheta:    ropeTheta,
		RMSNormEps:   rmsNormEps,
		Params:       params,
	}, nil
}

func determinePlacements(md *loader.ModelData, loaderCfg *loader.ModelConfig, cfg Config) []tensor.DevicePlacement {
	numLayers := int(loaderCfg.Params["num_hidden_layers"].(float64))

	// CPU-only mode
	if cfg.GPULayers == 0 || !device.CUDAAvailable() {
		return makeCPUPlacements(numLayers)
	}

	// Explicit GPU layer count
	if cfg.GPULayers > 0 {
		return makeExplicitPlacementsFitBudget(md, loaderCfg, numLayers, cfg.GPULayers, cfg.GPUBudget, cfg.GPUDevices)
	}

	// Auto mode: use VRAM budget
	return PlanLayerDevices(md, loaderCfg, cfg.GPUBudget, cfg.GPUDevices)
}
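
// How the modes above resolve, illustrated (hypothetical numbers, not taken
// from a real model): with 40 layers, GPULayers=10 and GPUDevices=[0, 1],
// makeExplicitPlacements keeps layers 0-29 on the CPU, assigns layers 30-34 to
// CUDA:0 and layers 35-39 to CUDA:1, and makeExplicitPlacementsFitBudget then
// shrinks that count if the estimated weight and KV-cache bytes exceed the
// per-GPU VRAM budget.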

func makeExplicitPlacements(numLayers, gpuLayers int, gpuDevices []int) []tensor.DevicePlacement {
	placements := make([]tensor.DevicePlacement, numLayers)
	if gpuLayers > numLayers {
		gpuLayers = numLayers
	}
	if len(gpuDevices) == 0 {
		gpuDevices = normalizedAutoGPUList(nil)
		if len(gpuDevices) == 0 {
			gpuDevices = []int{0}
		}
	}
	for i := range placements {
		placements[i] = tensor.DevicePlacement{Type: tensor.CPU, GPU: -1}
	}
	perDev := gpuLayers / len(gpuDevices)
	extra := gpuLayers % len(gpuDevices)
	start := numLayers - gpuLayers
	layer := start
	for di, dev := range gpuDevices {
		take := perDev
		if di < extra {
			take++
		}
		for j := 0; j < take && layer < numLayers; j++ {
			placements[layer] = tensor.DevicePlacement{Type: tensor.CUDA, GPU: dev}
			layer++
		}
	}
	return placements
}

func makeExplicitPlacementsFitBudget(md *loader.ModelData, loaderCfg *loader.ModelConfig, numLayers, gpuLayers int, budgetFraction float64, gpuDevices []int) []tensor.DevicePlacement {
	if gpuLayers <= 0 {
		return makeCPUPlacements(numLayers)
	}
	if budgetFraction <= 0 || budgetFraction > 1 {
		budgetFraction = 0.9
	}

	// If we can't query budgets, keep legacy behavior (may OOM, but avoids breaking CUDA-less builds).
	gpuList := normalizedAutoGPUList(gpuDevices)
	if len(gpuList) == 0 {
		gpuList = []int{0}
	}
	sort.Ints(gpuList)

	budgets := make(map[int]uint64, len(gpuList))
	for _, gpu := range gpuList {
		totalMem, freeMem, err := cudaMemoryInfoDeviceFn(gpu)
		if err != nil {
			continue
		}
		target := uint64(float64(totalMem) * budgetFraction)
		if target > freeMem {
			target = freeMem
		}
		if target <= defaultGPUReserveBytes {
			continue
		}
		target -= defaultGPUReserveBytes
		budgets[gpu] = target
	}
	if len(budgets) == 0 {
		return makeExplicitPlacements(numLayers, gpuLayers, gpuList)
	}

	layerWeights := estimateLayerWeightBytes(md, numLayers)
	layerCache := estimateLayerCacheBytes(loaderCfg, numLayers)
	layerNeed := make([]uint64, numLayers)
	for i := 0; i < numLayers; i++ {
		layerNeed[i] = layerWeights[i] + layerCache[i]
	}
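
	// Worked example of the budget math (hypothetical numbers): with
	// totalMem = 24 GiB, freeMem = 20 GiB and budgetFraction = 0.9, the
	// per-GPU budget is min(0.9*24 GiB, 20 GiB) = 20 GiB minus
	// defaultGPUReserveBytes; layers are accepted below only while the
	// running sum of layerNeed stays within that budget.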

	// Try from requested down to 1 until it fits.
	requested := gpuLayers
	for gpuLayers > 0 {
		placements := makeExplicitPlacements(numLayers, gpuLayers, gpuList)
		used := make(map[int]uint64, len(budgets))
		ok := true
		for layer, p := range placements {
			pp := p.Normalize()
			if pp.Type != tensor.CUDA || pp.GPU < 0 {
				continue
			}
			b, okBudget := budgets[pp.GPU]
			if !okBudget || b == 0 {
				ok = false
				break
			}
			used[pp.GPU] += layerNeed[layer]
			if used[pp.GPU] > b {
				ok = false
				break
			}
		}
		if ok {
			if gpuLayers < requested {
				log.Printf("placement: reducing explicit gpu_layers %d -> %d (budget=%.2f gpus=%v)", requested, gpuLayers, budgetFraction, gpuList)
			}
			return placements
		}
		gpuLayers--
	}

	log.Printf("placement: explicit gpu_layers=%d does not fit, falling back to CPU", requested)
	return makeCPUPlacements(numLayers)
}

func loadWeights(md *loader.ModelData, mod model.Model, dispatcher *device.DeviceDispatcher) error {
	type namedEntry struct {
		name string
		info loader.TensorEntry
	}
	ordered := make([]namedEntry, 0, len(md.Metadata.Tensors))
	for name, info := range md.Metadata.Tensors {
		ordered = append(ordered, namedEntry{name: name, info: info})
	}
	sort.Slice(ordered, func(i, j int) bool {
		return ordered[i].info.Offset < ordered[j].info.Offset
	})

	modelCfg := mod.Config()
	numLayers := 0
	lastOnGPU := false
	if modelCfg != nil && modelCfg.NumLayers > 0 {
		numLayers = modelCfg.NumLayers
		if dispatcher != nil {
			p := dispatcher.LayerPlacement(numLayers - 1).Normalize()
			lastOnGPU = p.Type == tensor.CUDA
		}
	}

	hasExplicitLMHead := false
	for n := range md.Metadata.Tensors {
		if strings.Contains(strings.ToLower(n), "lm_head") {
			hasExplicitLMHead = true
			break
		}
	}

	keepF16 := func(name string, shape tensor.Shape, onGPU bool) bool {
		if len(shape) != 2 {
			return false
		}
		ln := strings.ToLower(name)
		if strings.Contains(ln, "conv") || strings.Contains(ln, "a_log") || strings.Contains(ln, "dt_bias") || strings.Contains(ln, "o_norm") {
			return false
		}
		// Embedding is on CPU but we can keep fp16 (nn.Embedding handles it).
		if strings.Contains(ln, "embed_tokens") {
			// If lm_head is tied to embeddings and runs on CPU, we must keep F32.
			return hasExplicitLMHead || lastOnGPU
		}
		// Dense matmul weights for GPU layers can stay fp16 to avoid 2x RAM blow-up.
		return onGPU
	}

	for _, ent := range ordered {
		name := ent.name
		info := ent.info

		shape := make(tensor.Shape, len(info.Shape))
		for i, dim := range info.Shape {
			shape[i] = int(dim)
		}

		var dt tensor.DType
		decodeToF32 := false
		switch info.DType {
		case loader.F32:
			dt = tensor.Float32
		case loader.F16:
			dt = tensor.Float16
		case loader.BF16:
			dt = tensor.BFloat16
			decodeToF32 = true // keep conservative until BF16 matmul is supported
		case loader.Q4_K:
			dt = tensor.Q4_K
		case loader.Q3_K:
			dt = tensor.Q3_K
		case loader.Q5_K:
			dt = tensor.Q5_K
		case loader.Q6_K:
			dt = tensor.Q6_K
		case loader.Q8_K:
			dt = tensor.Q8_K
		case loader.Q2_K:
			dt = tensor.Q2_K
		default:
			dt = tensor.Float32
		}

		onGPU := false
		if dispatcher != nil && numLayers > 0 {
			if idx, ok := parseLayerIndex(name); ok && idx >= 0 && idx < numLayers {
				p := dispatcher.LayerPlacement(idx).Normalize()
				onGPU = p.Type == tensor.CUDA
			} else {
				ln := strings.ToLower(name)
				if strings.Contains(ln, "lm_head") || strings.Contains(ln, "output") || strings.Contains(ln, "model.norm") {
					onGPU = lastOnGPU
				}
			}
		}

		// Keep fp16 weights for GPU matmuls to avoid 2x RAM cost.
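		// For illustration (tensor names here are hypothetical):
		//   a 2D "mlp.down_proj.weight" on a GPU layer stays F16,
		//   the same weight on a CPU layer is decoded to F32, and
		//   "embed_tokens.weight" stays F16 only if an explicit lm_head
		//   exists or the last layer runs on GPU.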
		if dt == tensor.Float16 {
			decodeToF32 = !keepF16(name, shape, onGPU)
		}

		tBytes, err := md.GetTensorData(name)
		if err != nil {
			return err
		}
		t, err := cpu.NewTensorFromBytes(shape, dt, tBytes)
		if err != nil {
			return fmt.Errorf("failed to create tensor %s: %v", name, err)
		}

		if decodeToF32 {
			u16 := t.DataUint16()
			out := make([]float32, shape.NumElements())
			switch dt {
			case tensor.Float16:
				for i := range out {
					out[i] = float16BitsToFloat32(u16[i])
				}
			case tensor.BFloat16:
				for i := range out {
					out[i] = bfloat16BitsToFloat32(u16[i])
				}
			}
			t = cpu.NewTensor(shape, out)
		}

		// Unused tensors are ignored, so the error is intentionally discarded.
		_ = mod.SetTensor(name, t)
	}

	return nil
}

func float16BitsToFloat32(bits uint16) float32 {
	sign := uint32(bits&0x8000) << 16
	exp := int32((bits & 0x7C00) >> 10)
	mant := uint32(bits & 0x03FF)

	if exp == 0 {
		if mant == 0 {
			// Signed zero.
			return math.Float32frombits(sign)
		}
		// Subnormal: normalize the mantissa and adjust the exponent.
		for mant&0x0400 == 0 {
			mant <<= 1
			exp--
		}
		exp++
		mant &= 0x03FF
	} else if exp == 0x1F {
		if mant == 0 {
			// Infinity.
			return math.Float32frombits(sign | 0x7F800000)
		}
		// NaN.
		return math.Float32frombits(sign | 0x7FC00000)
	}

	// Re-bias the exponent from half precision (15) to single precision (127).
	exp = exp + (127 - 15)
	return math.Float32frombits(sign | (uint32(exp) << 23) | (mant << 13))
}

func bfloat16BitsToFloat32(bits uint16) float32 {
	return math.Float32frombits(uint32(bits) << 16)
}

// DeviceConfig is the legacy per-layer placement configuration, kept for
// backward compatibility with older callers.
type DeviceConfig struct {
	LayerDevices []tensor.DevicePlacement
}

// LoadModel is the legacy entry point: it counts the CUDA entries in
// config.LayerDevices and delegates to Load with that many GPU layers.
func LoadModel(path string, config DeviceConfig) (*Engine, error) {
	cfg := DefaultConfig()
	if len(config.LayerDevices) > 0 {
		// Count GPU layers
		gpuCount := 0
		for _, p := range config.LayerDevices {
			if p.Type == tensor.CUDA {
				gpuCount++
			}
		}
		cfg.GPULayers = gpuCount
	}
	return Load(path, cfg)
}
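
// Legacy usage sketch (illustrative only; the path and placements are hypothetical):
//
//	eng, err := engine.LoadModel("/path/to/model", engine.DeviceConfig{
//		LayerDevices: []tensor.DevicePlacement{
//			{Type: tensor.CUDA, GPU: 0},
//			{Type: tensor.CUDA, GPU: 0},
//			{Type: tensor.CPU, GPU: -1},
//		},
//	})
//
// Only the number of CUDA entries matters here; the actual per-layer order is
// recomputed by Load.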