// Package engine provides model loading and device-aware inference execution.
package engine

import (
	"context"
	"fmt"
	"log"
	"math"
	"sort"
	"strings"

	"makarna/pkg/backend/cpu"
	"makarna/pkg/backend/device"
	"makarna/pkg/compute"
	"makarna/pkg/loader"
	"makarna/pkg/model"
	_ "makarna/pkg/model/models"
	"makarna/pkg/tensor"
)

// Config configures the inference engine.
type Config struct {
	// GPULayers specifies how many layers to place on GPU (-1 = auto, 0 = CPU only).
	GPULayers int

	// GPUBudget is the fraction of GPU memory to use (0.0-1.0, default 0.9).
	GPUBudget float64

	// GPUDevices lists the GPU ordinals to use. In both auto mode
	// (GPULayers=-1) and explicit mode (GPULayers>0), nil/empty means
	// "use all visible GPUs".
	GPUDevices []int

	// UseMmap controls whether the model file is memory-mapped.
	// Default is false (load tensors into RAM).
	UseMmap bool

	// CPUMoE keeps MoE expert weights on CPU to save GPU memory.
	// Expert computations are done by uploading activations to GPU per expert,
	// similar to llama.cpp's --moe-cpu flag.
	CPUMoE bool
}

// DefaultConfig returns a sensible default configuration.
func DefaultConfig() Config {
	return Config{
		GPULayers:  -1, // auto
		GPUBudget:  0.9,
		GPUDevices: nil,
		UseMmap:    false,
	}
}
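
// Typical usage (a sketch; "model.gguf" is a placeholder path):
//
//	eng, err := Load("model.gguf", DefaultConfig())
//	if err != nil {
//		log.Fatal(err)
//	}
//	defer eng.Close()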

// Engine manages model execution with automatic device placement.
type Engine struct {
	model      model.Model
	loader     *loader.ModelData
	dispatcher *device.DeviceDispatcher
	config     Config
}

// Load loads a model with the given configuration.
// Device placement is determined automatically based on the config and the
// available resources.
func Load(path string, cfg Config) (*Engine, error) {
	// 1. Load the model file.
	md, err := loader.LoadWithOptions(path, loader.LoadOptions{UseMmap: cfg.UseMmap})
	if err != nil {
		return nil, fmt.Errorf("failed to load model file: %w", err)
	}

	// 2. Parse the model config.
	modelCfg, err := parseModelConfig(md)
	if err != nil {
		md.Close()
		return nil, err
	}

	// 3. Determine device placements.
	placements := determinePlacements(md, &md.Metadata.ModelConfig, cfg)
	dispatcher := device.NewDeviceDispatcher(placements)

	gpuCount := dispatcher.NumGPULayers()
	cpuCount := len(placements) - gpuCount
	if gpuCount > 0 {
		log.Printf("Device placement: %d layers on GPU, %d layers on CPU", gpuCount, cpuCount)
	} else {
		log.Printf("Device placement: all %d layers on CPU", len(placements))
	}

	// 4. Create the model.
	mod, err := model.New(modelCfg.Architecture, modelCfg)
	if err != nil {
		md.Close()
		return nil, err
	}

	// 5. Load weights (currently all on CPU; transfer to GPU is lazy).
	if err := loadWeights(md, mod, dispatcher); err != nil {
		md.Close()
		return nil, err
	}
	if v, ok := mod.(interface{ Validate() error }); ok {
		if err := v.Validate(); err != nil {
			md.Close()
			return nil, err
		}
	}

	return &Engine{
		model:      mod,
		loader:     md,
		dispatcher: dispatcher,
		config:     cfg,
	}, nil
}

// Forward performs a forward pass through the model. Device placement is
// handled transparently: the engine ensures tensors are on the correct
// device for each layer.
func (e *Engine) Forward(ctx interface{}, input tensor.Tensor, positions tensor.Tensor, kvCache model.KVCache) (tensor.Tensor, error) {
	return e.model.Forward(e.prepareContext(ctx), input, positions, kvCache)
}

// ForwardBatch performs a batched forward pass across several KV caches.
// It fails if the model does not implement model.BatchForwarder.
func (e *Engine) ForwardBatch(ctx interface{}, input tensor.Tensor, positions tensor.Tensor, kvCaches []model.KVCache) (tensor.Tensor, error) {
	m, ok := e.model.(model.BatchForwarder)
	if !ok {
		return nil, fmt.Errorf("model does not support ForwardBatch")
	}
	return m.ForwardBatch(e.prepareContext(ctx), input, positions, kvCaches)
}

// prepareContext normalizes the caller-supplied context and attaches the
// dispatcher and CPU-MoE flag for device-aware operations.
func (e *Engine) prepareContext(ctx interface{}) context.Context {
	goCtx := context.Background()
	if c, ok := ctx.(context.Context); ok && c != nil {
		goCtx = c
	}
	goCtx = compute.WithDispatcher(goCtx, e.dispatcher)
	return compute.WithCPUMoE(goCtx, e.config.CPUMoE)
}
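
// Decode-step sketch (illustrative only: the token/position tensors and the
// KV cache are assumed to be built elsewhere; no constructor names from this
// package are implied):
//
//	logits, err := eng.Forward(context.Background(), tokens, positions, cache)
//	if err != nil { /* handle */ }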

// Model returns the underlying model.
func (e *Engine) Model() model.Model {
	return e.model
}

// Loader returns the model data loader.
func (e *Engine) Loader() *loader.ModelData {
	return e.loader
}

// Dispatcher returns the device dispatcher for layer placement queries.
func (e *Engine) Dispatcher() *device.DeviceDispatcher {
	return e.dispatcher
}

// ComputeContext creates a compute context for a specific layer.
func (e *Engine) ComputeContext(layerIdx int) *compute.Context {
	return compute.NewContext(e.dispatcher, layerIdx)
}

// Close releases all resources.
func (e *Engine) Close() error {
	if e.dispatcher != nil {
		e.dispatcher.Clear()
	}
	if e.loader != nil {
		return e.loader.Close()
	}
	return nil
}
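
// parseModelConfig extracts common hyperparameters from the loader metadata,
// applying standard defaults: num_key_value_heads falls back to
// num_attention_heads, head_dim to hidden_size/num_heads, rope_theta to
// 10000, and rms_norm_eps to 1e-6.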
func parseModelConfig(md *loader.ModelData) (*model.Config, error) {
	params := md.Metadata.ModelConfig.Params
	hiddenSize, _ := params["hidden_size"].(float64)
	numLayers, _ := params["num_hidden_layers"].(float64)
	numHeads, _ := params["num_attention_heads"].(float64)
	vocabSize, _ := params["vocab_size"].(float64)
	intermediate, _ := params["intermediate_size"].(float64)

	numKVHeads, _ := params["num_key_value_heads"].(float64)
	if numKVHeads == 0 {
		numKVHeads = numHeads
	}
	headDim, _ := params["head_dim"].(float64)
	if headDim == 0 {
		headDim = hiddenSize / numHeads
	}
	ropeTheta, _ := params["rope_theta"].(float64)
	if ropeTheta == 0 {
		ropeTheta = 10000.0
	}
	rmsNormEps, _ := params["rms_norm_eps"].(float64)
	if rmsNormEps == 0 {
		rmsNormEps = 1e-6
	}

	return &model.Config{
		Architecture: md.Metadata.ModelConfig.Architecture,
		HiddenSize:   int(hiddenSize),
		NumLayers:    int(numLayers),
		NumHeads:     int(numHeads),
		NumKVHeads:   int(numKVHeads),
		VocabSize:    int(vocabSize),
		Intermediate: int(intermediate),
		HeadDim:      int(headDim),
		RopeTheta:    ropeTheta,
		RMSNormEps:   rmsNormEps,
		Params:       params,
	}, nil
}
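
// determinePlacements picks a per-layer device assignment: CPU-only when
// GPULayers is 0 or CUDA is unavailable, a budget-checked explicit split when
// GPULayers > 0, and a VRAM-budget-driven plan otherwise.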
func determinePlacements(md *loader.ModelData, loaderCfg *loader.ModelConfig, cfg Config) []tensor.DevicePlacement {
	// Tolerate a missing num_hidden_layers instead of panicking on the assertion.
	layersF, _ := loaderCfg.Params["num_hidden_layers"].(float64)
	numLayers := int(layersF)

	// CPU-only mode.
	if cfg.GPULayers == 0 || !device.CUDAAvailable() {
		return makeCPUPlacements(numLayers)
	}
	// Explicit GPU layer count.
	if cfg.GPULayers > 0 {
		return makeExplicitPlacementsFitBudget(md, loaderCfg, numLayers, cfg.GPULayers, cfg.GPUBudget, cfg.GPUDevices)
	}
	// Auto mode: plan against the VRAM budget.
	return PlanLayerDevices(md, loaderCfg, cfg.GPUBudget, cfg.GPUDevices)
}
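
// makeExplicitPlacements assigns the last gpuLayers layers to the given GPUs,
// splitting them as evenly as possible (lower-indexed devices absorb the
// remainder). For example, numLayers=32, gpuLayers=10, gpuDevices=[0,1]
// yields layers 0-21 on CPU, layers 22-26 on GPU 0, and layers 27-31 on GPU 1.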
func makeExplicitPlacements(numLayers, gpuLayers int, gpuDevices []int) []tensor.DevicePlacement {
	placements := make([]tensor.DevicePlacement, numLayers)
	if gpuLayers > numLayers {
		gpuLayers = numLayers
	}
	if len(gpuDevices) == 0 {
		gpuDevices = normalizedAutoGPUList(nil)
		if len(gpuDevices) == 0 {
			gpuDevices = []int{0}
		}
	}
	for i := range placements {
		placements[i] = tensor.DevicePlacement{Type: tensor.CPU, GPU: -1}
	}

	perDev := gpuLayers / len(gpuDevices)
	extra := gpuLayers % len(gpuDevices)
	start := numLayers - gpuLayers
	layer := start
	for di, dev := range gpuDevices {
		take := perDev
		if di < extra {
			take++
		}
		for j := 0; j < take && layer < numLayers; j++ {
			placements[layer] = tensor.DevicePlacement{Type: tensor.CUDA, GPU: dev}
			layer++
		}
	}
	return placements
}
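
// makeExplicitPlacementsFitBudget honors an explicit GPU layer request but
// shrinks it to fit per-device memory budgets: each GPU's budget is
// min(total*budgetFraction, free) minus a fixed reserve, and the layer count
// is decremented from the requested value until the estimated weight plus
// cache bytes fit on every device.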
func makeExplicitPlacementsFitBudget(md *loader.ModelData, loaderCfg *loader.ModelConfig, numLayers, gpuLayers int, budgetFraction float64, gpuDevices []int) []tensor.DevicePlacement {
	if gpuLayers <= 0 {
		return makeCPUPlacements(numLayers)
	}
	if budgetFraction <= 0 || budgetFraction > 1 {
		budgetFraction = 0.9
	}

	gpuList := normalizedAutoGPUList(gpuDevices)
	if len(gpuList) == 0 {
		gpuList = []int{0}
	}
	sort.Ints(gpuList)

	budgets := make(map[int]uint64, len(gpuList))
	for _, gpu := range gpuList {
		totalMem, freeMem, err := cudaMemoryInfoDeviceFn(gpu)
		if err != nil {
			continue
		}
		target := uint64(float64(totalMem) * budgetFraction)
		if target > freeMem {
			target = freeMem
		}
		if target <= defaultGPUReserveBytes {
			continue
		}
		target -= defaultGPUReserveBytes
		budgets[gpu] = target
	}
	// If we can't query budgets, keep the legacy behavior (may OOM, but
	// avoids breaking CUDA-less builds).
	if len(budgets) == 0 {
		return makeExplicitPlacements(numLayers, gpuLayers, gpuList)
	}

	layerWeights := estimateLayerWeightBytes(md, numLayers)
	layerCache := estimateLayerCacheBytes(loaderCfg, numLayers)
	layerNeed := make([]uint64, numLayers)
	for i := 0; i < numLayers; i++ {
		layerNeed[i] = layerWeights[i] + layerCache[i]
	}

	// Try from the requested count down to 1 until the placement fits.
	requested := gpuLayers
	for gpuLayers > 0 {
		placements := makeExplicitPlacements(numLayers, gpuLayers, gpuList)
		used := make(map[int]uint64, len(budgets))
		ok := true
		for layer, p := range placements {
			pp := p.Normalize()
			if pp.Type != tensor.CUDA || pp.GPU < 0 {
				continue
			}
			b, okBudget := budgets[pp.GPU]
			if !okBudget || b == 0 {
				ok = false
				break
			}
			used[pp.GPU] += layerNeed[layer]
			if used[pp.GPU] > b {
				ok = false
				break
			}
		}
		if ok {
			if gpuLayers < requested {
				log.Printf("placement: reducing explicit gpu_layers %d -> %d (budget=%.2f gpus=%v)", requested, gpuLayers, budgetFraction, gpuList)
			}
			return placements
		}
		gpuLayers--
	}
	log.Printf("placement: explicit gpu_layers=%d does not fit, falling back to CPU", requested)
	return makeCPUPlacements(numLayers)
}
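
// loadWeights materializes tensors in file-offset order (for sequential I/O),
// mapping loader dtypes to tensor dtypes. FP16 matmul weights destined for
// GPU layers are kept as-is to halve RAM; BF16 and CPU-bound FP16 weights are
// decoded to F32.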
func loadWeights(md *loader.ModelData, mod model.Model, dispatcher *device.DeviceDispatcher) error {
	type namedEntry struct {
		name string
		info loader.TensorEntry
	}
	ordered := make([]namedEntry, 0, len(md.Metadata.Tensors))
	for name, info := range md.Metadata.Tensors {
		ordered = append(ordered, namedEntry{name: name, info: info})
	}
	sort.Slice(ordered, func(i, j int) bool {
		return ordered[i].info.Offset < ordered[j].info.Offset
	})

	modelCfg := mod.Config()
	numLayers := 0
	lastOnGPU := false
	if modelCfg != nil && modelCfg.NumLayers > 0 {
		numLayers = modelCfg.NumLayers
		if dispatcher != nil {
			p := dispatcher.LayerPlacement(numLayers - 1).Normalize()
			lastOnGPU = p.Type == tensor.CUDA
		}
	}

	hasExplicitLMHead := false
	for n := range md.Metadata.Tensors {
		if strings.Contains(strings.ToLower(n), "lm_head") {
			hasExplicitLMHead = true
			break
		}
	}

	keepF16 := func(name string, shape tensor.Shape, onGPU bool) bool {
		if len(shape) != 2 {
			return false
		}
		ln := strings.ToLower(name)
		if strings.Contains(ln, "conv") || strings.Contains(ln, "a_log") || strings.Contains(ln, "dt_bias") || strings.Contains(ln, "o_norm") {
			return false
		}
		// Embeddings live on CPU but can stay fp16 (nn.Embedding handles it).
		// If lm_head is tied to the embeddings and runs on CPU, we must keep F32.
		if strings.Contains(ln, "embed_tokens") {
			return hasExplicitLMHead || lastOnGPU
		}
		// Dense matmul weights for GPU layers stay fp16 to avoid a 2x RAM blow-up.
		return onGPU
	}

	for _, ent := range ordered {
		name := ent.name
		info := ent.info

		shape := make(tensor.Shape, len(info.Shape))
		for i, dim := range info.Shape {
			shape[i] = int(dim)
		}

		var dt tensor.DType
		decodeToF32 := false
		switch info.DType {
		case loader.F32:
			dt = tensor.Float32
		case loader.F16:
			dt = tensor.Float16
		case loader.BF16:
			dt = tensor.BFloat16
			decodeToF32 = true // stay conservative until BF16 matmul is supported
		case loader.Q4_K:
			dt = tensor.Q4_K
		case loader.Q3_K:
			dt = tensor.Q3_K
		case loader.Q5_K:
			dt = tensor.Q5_K
		case loader.Q6_K:
			dt = tensor.Q6_K
		case loader.Q8_K:
			dt = tensor.Q8_K
		case loader.Q2_K:
			dt = tensor.Q2_K
		default:
			dt = tensor.Float32
		}

		onGPU := false
		if dispatcher != nil && numLayers > 0 {
			if idx, ok := parseLayerIndex(name); ok && idx >= 0 && idx < numLayers {
				p := dispatcher.LayerPlacement(idx).Normalize()
				onGPU = p.Type == tensor.CUDA
			} else {
				ln := strings.ToLower(name)
				if strings.Contains(ln, "lm_head") || strings.Contains(ln, "output") || strings.Contains(ln, "model.norm") {
					onGPU = lastOnGPU
				}
			}
		}
		// Keep fp16 weights for GPU matmuls to avoid the 2x RAM cost.
		if dt == tensor.Float16 {
			decodeToF32 = !keepF16(name, shape, onGPU)
		}

		tBytes, err := md.GetTensorData(name)
		if err != nil {
			return err
		}
		t, err := cpu.NewTensorFromBytes(shape, dt, tBytes)
		if err != nil {
			return fmt.Errorf("failed to create tensor %s: %v", name, err)
		}
		if decodeToF32 {
			u16 := t.DataUint16()
			out := make([]float32, shape.NumElements())
			switch dt {
			case tensor.Float16:
				for i := range out {
					out[i] = float16BitsToFloat32(u16[i])
				}
			case tensor.BFloat16:
				for i := range out {
					out[i] = bfloat16BitsToFloat32(u16[i])
				}
			}
			t = cpu.NewTensor(shape, out)
		}
		// Tensors the model doesn't recognize are deliberately ignored.
		_ = mod.SetTensor(name, t)
	}
	return nil
}
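
// float16BitsToFloat32 converts an IEEE 754 binary16 bit pattern to float32,
// including subnormals, infinities, and NaN (e.g. 0x3C00 -> 1.0,
// 0xC000 -> -2.0, 0x7C00 -> +Inf).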
func float16BitsToFloat32(bits uint16) float32 {
	sign := uint32(bits&0x8000) << 16
	exp := int32((bits & 0x7C00) >> 10)
	mant := uint32(bits & 0x03FF)
	if exp == 0 {
		if mant == 0 {
			return math.Float32frombits(sign) // signed zero
		}
		// Subnormal: shift the mantissa up until the implicit leading bit
		// appears, adjusting the exponent to match.
		for mant&0x0400 == 0 {
			mant <<= 1
			exp--
		}
		exp++
		mant &= 0x03FF
	} else if exp == 0x1F {
		if mant == 0 {
			return math.Float32frombits(sign | 0x7F800000) // +/-Inf
		}
		return math.Float32frombits(sign | 0x7FC00000) // NaN
	}
	// Rebias the exponent from binary16 (15) to binary32 (127).
	exp += 127 - 15
	return math.Float32frombits(sign | (uint32(exp) << 23) | (mant << 13))
}

// bfloat16BitsToFloat32 widens a bfloat16 bit pattern to float32; bfloat16 is
// the top 16 bits of a float32, so a left shift suffices (e.g. 0x3F80 -> 1.0).
func bfloat16BitsToFloat32(bits uint16) float32 {
	return math.Float32frombits(uint32(bits) << 16)
}

// Legacy compatibility.

// DeviceConfig is the legacy placement configuration, kept for callers that
// still pass an explicit per-layer placement list.
type DeviceConfig struct {
	LayerDevices []tensor.DevicePlacement
}

// LoadModel adapts a legacy explicit placement list to Load by counting the
// CUDA entries and passing that count as GPULayers.
func LoadModel(path string, config DeviceConfig) (*Engine, error) {
	cfg := DefaultConfig()
	if len(config.LayerDevices) > 0 {
		gpuCount := 0
		for _, p := range config.LayerDevices {
			if p.Type == tensor.CUDA {
				gpuCount++
			}
		}
		cfg.GPULayers = gpuCount
	}
	return Load(path, cfg)
}