// Package engine provides model loading and device-aware inference execution.
package engine

import (
	"context"
	"fmt"
	"log"
	"math"
	"sort"
	"strings"

	"makarna/pkg/backend/cpu"
	"makarna/pkg/backend/device"
	"makarna/pkg/compute"
	"makarna/pkg/loader"
	"makarna/pkg/model"
	_ "makarna/pkg/model/models"
	"makarna/pkg/tensor"
)

// Config configures the inference engine.
type Config struct {
	// GPULayers specifies how many layers to place on GPU (-1 = auto, 0 = CPU only).
	GPULayers int

	// GPUBudget is the fraction of GPU memory to use (0.0-1.0, default 0.9).
	GPUBudget float64

	// GPUDevices lists the GPU ordinals to use. In both auto mode
	// (GPULayers=-1) and explicit mode (GPULayers>0), nil/empty means
	// "use all visible GPUs".
	GPUDevices []int

	// UseMmap controls whether the model file is memory-mapped.
	// Default is false (load tensors into RAM).
	UseMmap bool

	// CPUMoE keeps MoE expert weights on CPU to save GPU memory.
	// Expert computations are done by uploading activations to GPU per expert,
	// similar to llama.cpp's --moe-cpu flag.
	CPUMoE bool
}

// DefaultConfig returns a sensible default configuration.
func DefaultConfig() Config {
	return Config{
		GPULayers:  -1, // auto
		GPUBudget:  0.9,
		GPUDevices: nil,
		UseMmap:    false,
	}
}
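
// Typical usage (a sketch; "model.gguf" is a placeholder path):
//
//	eng, err := Load("model.gguf", DefaultConfig())
//	if err != nil {
//		log.Fatal(err)
//	}
//	defer eng.Close()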

// Engine manages model execution with automatic device placement.
type Engine struct {
	model      model.Model
	loader     *loader.ModelData
	dispatcher *device.DeviceDispatcher
	config     Config
}

// Load loads a model with the given configuration.
// Device placement is determined automatically based on the config and the
// available resources.
func Load(path string, cfg Config) (*Engine, error) {
	// 1. Load the model file.
	md, err := loader.LoadWithOptions(path, loader.LoadOptions{UseMmap: cfg.UseMmap})
	if err != nil {
		return nil, fmt.Errorf("failed to load model file: %w", err)
	}

	// 2. Parse the model config.
	modelCfg, err := parseModelConfig(md)
	if err != nil {
		md.Close()
		return nil, err
	}

	// 3. Determine device placements.
	placements := determinePlacements(md, &md.Metadata.ModelConfig, cfg)
	dispatcher := device.NewDeviceDispatcher(placements)

	gpuCount := dispatcher.NumGPULayers()
	cpuCount := len(placements) - gpuCount
	if gpuCount > 0 {
		log.Printf("Device placement: %d layers on GPU, %d layers on CPU", gpuCount, cpuCount)
	} else {
		log.Printf("Device placement: all %d layers on CPU", len(placements))
	}

	// 4. Create the model.
	mod, err := model.New(modelCfg.Architecture, modelCfg)
	if err != nil {
		md.Close()
		return nil, err
	}

	// 5. Load weights (currently all on CPU; transfer to GPU is lazy).
	if err := loadWeights(md, mod, dispatcher); err != nil {
		md.Close()
		return nil, err
	}
	if v, ok := mod.(interface{ Validate() error }); ok {
		if err := v.Validate(); err != nil {
			md.Close()
			return nil, err
		}
	}

	return &Engine{
		model:      mod,
		loader:     md,
		dispatcher: dispatcher,
		config:     cfg,
	}, nil
}

// Forward performs a forward pass through the model. Device placement is
// handled transparently: the engine ensures tensors are on the correct
// device for each layer.
func (e *Engine) Forward(ctx interface{}, input tensor.Tensor, positions tensor.Tensor, kvCache model.KVCache) (tensor.Tensor, error) {
	return e.model.Forward(e.prepareContext(ctx), input, positions, kvCache)
}

// ForwardBatch performs a batched forward pass across several KV caches.
// It fails if the model does not implement model.BatchForwarder.
func (e *Engine) ForwardBatch(ctx interface{}, input tensor.Tensor, positions tensor.Tensor, kvCaches []model.KVCache) (tensor.Tensor, error) {
	m, ok := e.model.(model.BatchForwarder)
	if !ok {
		return nil, fmt.Errorf("model does not support ForwardBatch")
	}
	return m.ForwardBatch(e.prepareContext(ctx), input, positions, kvCaches)
}

// prepareContext normalizes the caller-supplied context and attaches the
// dispatcher and CPU-MoE flag for device-aware operations.
func (e *Engine) prepareContext(ctx interface{}) context.Context {
	goCtx := context.Background()
	if c, ok := ctx.(context.Context); ok && c != nil {
		goCtx = c
	}
	goCtx = compute.WithDispatcher(goCtx, e.dispatcher)
	return compute.WithCPUMoE(goCtx, e.config.CPUMoE)
}
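
// Decode-step sketch (illustrative only: the token/position tensors and the
// KV cache are assumed to be built elsewhere; no constructor names from this
// package are implied):
//
//	logits, err := eng.Forward(context.Background(), tokens, positions, cache)
//	if err != nil { /* handle */ }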

// Model returns the underlying model.
func (e *Engine) Model() model.Model {
	return e.model
}

// Loader returns the model data loader.
func (e *Engine) Loader() *loader.ModelData {
	return e.loader
}

// Dispatcher returns the device dispatcher for layer placement queries.
func (e *Engine) Dispatcher() *device.DeviceDispatcher {
	return e.dispatcher
}

// ComputeContext creates a compute context for a specific layer.
func (e *Engine) ComputeContext(layerIdx int) *compute.Context {
	return compute.NewContext(e.dispatcher, layerIdx)
}

// Close releases all resources.
func (e *Engine) Close() error {
	if e.dispatcher != nil {
		e.dispatcher.Clear()
	}
	if e.loader != nil {
		return e.loader.Close()
	}
	return nil
}
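
// parseModelConfig extracts common hyperparameters from the loader metadata,
// applying standard defaults: num_key_value_heads falls back to
// num_attention_heads, head_dim to hidden_size/num_heads, rope_theta to
// 10000, and rms_norm_eps to 1e-6.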
func parseModelConfig(md *loader.ModelData) (*model.Config, error) {
	params := md.Metadata.ModelConfig.Params
	hiddenSize, _ := params["hidden_size"].(float64)
	numLayers, _ := params["num_hidden_layers"].(float64)
	numHeads, _ := params["num_attention_heads"].(float64)
	vocabSize, _ := params["vocab_size"].(float64)
	intermediate, _ := params["intermediate_size"].(float64)

	numKVHeads, _ := params["num_key_value_heads"].(float64)
	if numKVHeads == 0 {
		numKVHeads = numHeads
	}
	headDim, _ := params["head_dim"].(float64)
	if headDim == 0 {
		headDim = hiddenSize / numHeads
	}
	ropeTheta, _ := params["rope_theta"].(float64)
	if ropeTheta == 0 {
		ropeTheta = 10000.0
	}
	rmsNormEps, _ := params["rms_norm_eps"].(float64)
	if rmsNormEps == 0 {
		rmsNormEps = 1e-6
	}

	return &model.Config{
		Architecture: md.Metadata.ModelConfig.Architecture,
		HiddenSize:   int(hiddenSize),
		NumLayers:    int(numLayers),
		NumHeads:     int(numHeads),
		NumKVHeads:   int(numKVHeads),
		VocabSize:    int(vocabSize),
		Intermediate: int(intermediate),
		HeadDim:      int(headDim),
		RopeTheta:    ropeTheta,
		RMSNormEps:   rmsNormEps,
		Params:       params,
	}, nil
}
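
// determinePlacements picks a per-layer device assignment: CPU-only when
// GPULayers is 0 or CUDA is unavailable, a budget-checked explicit split when
// GPULayers > 0, and a VRAM-budget-driven plan otherwise.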
func determinePlacements(md *loader.ModelData, loaderCfg *loader.ModelConfig, cfg Config) []tensor.DevicePlacement {
	// Tolerate a missing num_hidden_layers instead of panicking on the assertion.
	layersF, _ := loaderCfg.Params["num_hidden_layers"].(float64)
	numLayers := int(layersF)

	// CPU-only mode.
	if cfg.GPULayers == 0 || !device.CUDAAvailable() {
		return makeCPUPlacements(numLayers)
	}
	// Explicit GPU layer count.
	if cfg.GPULayers > 0 {
		return makeExplicitPlacementsFitBudget(md, loaderCfg, numLayers, cfg.GPULayers, cfg.GPUBudget, cfg.GPUDevices)
	}
	// Auto mode: plan against the VRAM budget.
	return PlanLayerDevices(md, loaderCfg, cfg.GPUBudget, cfg.GPUDevices)
}
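
// makeExplicitPlacements assigns the last gpuLayers layers to the given GPUs,
// splitting them as evenly as possible (lower-indexed devices absorb the
// remainder). For example, numLayers=32, gpuLayers=10, gpuDevices=[0,1]
// yields layers 0-21 on CPU, layers 22-26 on GPU 0, and layers 27-31 on GPU 1.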
func makeExplicitPlacements(numLayers, gpuLayers int, gpuDevices []int) []tensor.DevicePlacement {
	placements := make([]tensor.DevicePlacement, numLayers)
	if gpuLayers > numLayers {
		gpuLayers = numLayers
	}
	if len(gpuDevices) == 0 {
		gpuDevices = normalizedAutoGPUList(nil)
		if len(gpuDevices) == 0 {
			gpuDevices = []int{0}
		}
	}
	for i := range placements {
		placements[i] = tensor.DevicePlacement{Type: tensor.CPU, GPU: -1}
	}

	perDev := gpuLayers / len(gpuDevices)
	extra := gpuLayers % len(gpuDevices)
	start := numLayers - gpuLayers
	layer := start
	for di, dev := range gpuDevices {
		take := perDev
		if di < extra {
			take++
		}
		for j := 0; j < take && layer < numLayers; j++ {
			placements[layer] = tensor.DevicePlacement{Type: tensor.CUDA, GPU: dev}
			layer++
		}
	}
	return placements
}
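
// makeExplicitPlacementsFitBudget honors an explicit GPU layer request but
// shrinks it to fit per-device memory budgets: each GPU's budget is
// min(total*budgetFraction, free) minus a fixed reserve, and the layer count
// is decremented from the requested value until the estimated weight plus
// cache bytes fit on every device.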
func makeExplicitPlacementsFitBudget(md *loader.ModelData, loaderCfg *loader.ModelConfig, numLayers, gpuLayers int, budgetFraction float64, gpuDevices []int) []tensor.DevicePlacement {
	if gpuLayers <= 0 {
		return makeCPUPlacements(numLayers)
	}
	if budgetFraction <= 0 || budgetFraction > 1 {
		budgetFraction = 0.9
	}

	gpuList := normalizedAutoGPUList(gpuDevices)
	if len(gpuList) == 0 {
		gpuList = []int{0}
	}
	sort.Ints(gpuList)

	budgets := make(map[int]uint64, len(gpuList))
	for _, gpu := range gpuList {
		totalMem, freeMem, err := cudaMemoryInfoDeviceFn(gpu)
		if err != nil {
			continue
		}
		target := uint64(float64(totalMem) * budgetFraction)
		if target > freeMem {
			target = freeMem
		}
		if target <= defaultGPUReserveBytes {
			continue
		}
		target -= defaultGPUReserveBytes
		budgets[gpu] = target
	}
	// If we can't query budgets, keep the legacy behavior (may OOM, but
	// avoids breaking CUDA-less builds).
	if len(budgets) == 0 {
		return makeExplicitPlacements(numLayers, gpuLayers, gpuList)
	}

	layerWeights := estimateLayerWeightBytes(md, numLayers)
	layerCache := estimateLayerCacheBytes(loaderCfg, numLayers)
	layerNeed := make([]uint64, numLayers)
	for i := 0; i < numLayers; i++ {
		layerNeed[i] = layerWeights[i] + layerCache[i]
	}

	// Try from the requested count down to 1 until the placement fits.
	requested := gpuLayers
	for gpuLayers > 0 {
		placements := makeExplicitPlacements(numLayers, gpuLayers, gpuList)
		used := make(map[int]uint64, len(budgets))
		ok := true
		for layer, p := range placements {
			pp := p.Normalize()
			if pp.Type != tensor.CUDA || pp.GPU < 0 {
				continue
			}
			b, okBudget := budgets[pp.GPU]
			if !okBudget || b == 0 {
				ok = false
				break
			}
			used[pp.GPU] += layerNeed[layer]
			if used[pp.GPU] > b {
				ok = false
				break
			}
		}
		if ok {
			if gpuLayers < requested {
				log.Printf("placement: reducing explicit gpu_layers %d -> %d (budget=%.2f gpus=%v)", requested, gpuLayers, budgetFraction, gpuList)
			}
			return placements
		}
		gpuLayers--
	}
	log.Printf("placement: explicit gpu_layers=%d does not fit, falling back to CPU", requested)
	return makeCPUPlacements(numLayers)
}
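
// loadWeights materializes tensors in file-offset order (for sequential I/O),
// mapping loader dtypes to tensor dtypes. FP16 matmul weights destined for
// GPU layers are kept as-is to halve RAM; BF16 and CPU-bound FP16 weights are
// decoded to F32.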
func loadWeights(md *loader.ModelData, mod model.Model, dispatcher *device.DeviceDispatcher) error {
	type namedEntry struct {
		name string
		info loader.TensorEntry
	}
	ordered := make([]namedEntry, 0, len(md.Metadata.Tensors))
	for name, info := range md.Metadata.Tensors {
		ordered = append(ordered, namedEntry{name: name, info: info})
	}
	sort.Slice(ordered, func(i, j int) bool {
		return ordered[i].info.Offset < ordered[j].info.Offset
	})

	modelCfg := mod.Config()
	numLayers := 0
	lastOnGPU := false
	if modelCfg != nil && modelCfg.NumLayers > 0 {
		numLayers = modelCfg.NumLayers
		if dispatcher != nil {
			p := dispatcher.LayerPlacement(numLayers - 1).Normalize()
			lastOnGPU = p.Type == tensor.CUDA
		}
	}

	hasExplicitLMHead := false
	for n := range md.Metadata.Tensors {
		if strings.Contains(strings.ToLower(n), "lm_head") {
			hasExplicitLMHead = true
			break
		}
	}

	keepF16 := func(name string, shape tensor.Shape, onGPU bool) bool {
		if len(shape) != 2 {
			return false
		}
		ln := strings.ToLower(name)
		if strings.Contains(ln, "conv") || strings.Contains(ln, "a_log") || strings.Contains(ln, "dt_bias") || strings.Contains(ln, "o_norm") {
			return false
		}
		// Embeddings live on CPU but can stay fp16 (nn.Embedding handles it).
		// If lm_head is tied to the embeddings and runs on CPU, we must keep F32.
		if strings.Contains(ln, "embed_tokens") {
			return hasExplicitLMHead || lastOnGPU
		}
		// Dense matmul weights for GPU layers stay fp16 to avoid a 2x RAM blow-up.
		return onGPU
	}

	for _, ent := range ordered {
		name := ent.name
		info := ent.info

		shape := make(tensor.Shape, len(info.Shape))
		for i, dim := range info.Shape {
			shape[i] = int(dim)
		}

		var dt tensor.DType
		decodeToF32 := false
		switch info.DType {
		case loader.F32:
			dt = tensor.Float32
		case loader.F16:
			dt = tensor.Float16
		case loader.BF16:
			dt = tensor.BFloat16
			decodeToF32 = true // stay conservative until BF16 matmul is supported
		case loader.Q4_K:
			dt = tensor.Q4_K
		case loader.Q3_K:
			dt = tensor.Q3_K
		case loader.Q5_K:
			dt = tensor.Q5_K
		case loader.Q6_K:
			dt = tensor.Q6_K
		case loader.Q8_K:
			dt = tensor.Q8_K
		case loader.Q2_K:
			dt = tensor.Q2_K
		default:
			dt = tensor.Float32
		}

		onGPU := false
		if dispatcher != nil && numLayers > 0 {
			if idx, ok := parseLayerIndex(name); ok && idx >= 0 && idx < numLayers {
				p := dispatcher.LayerPlacement(idx).Normalize()
				onGPU = p.Type == tensor.CUDA
			} else {
				ln := strings.ToLower(name)
				if strings.Contains(ln, "lm_head") || strings.Contains(ln, "output") || strings.Contains(ln, "model.norm") {
					onGPU = lastOnGPU
				}
			}
		}
		// Keep fp16 weights for GPU matmuls to avoid the 2x RAM cost.
		if dt == tensor.Float16 {
			decodeToF32 = !keepF16(name, shape, onGPU)
		}

		tBytes, err := md.GetTensorData(name)
		if err != nil {
			return err
		}
		t, err := cpu.NewTensorFromBytes(shape, dt, tBytes)
		if err != nil {
			return fmt.Errorf("failed to create tensor %s: %v", name, err)
		}
		if decodeToF32 {
			u16 := t.DataUint16()
			out := make([]float32, shape.NumElements())
			switch dt {
			case tensor.Float16:
				for i := range out {
					out[i] = float16BitsToFloat32(u16[i])
				}
			case tensor.BFloat16:
				for i := range out {
					out[i] = bfloat16BitsToFloat32(u16[i])
				}
			}
			t = cpu.NewTensor(shape, out)
		}
		// Tensors the model doesn't recognize are deliberately ignored.
		_ = mod.SetTensor(name, t)
	}
	return nil
}
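
// float16BitsToFloat32 converts an IEEE 754 binary16 bit pattern to float32,
// including subnormals, infinities, and NaN (e.g. 0x3C00 -> 1.0,
// 0xC000 -> -2.0, 0x7C00 -> +Inf).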
func float16BitsToFloat32(bits uint16) float32 {
	sign := uint32(bits&0x8000) << 16
	exp := int32((bits & 0x7C00) >> 10)
	mant := uint32(bits & 0x03FF)
	if exp == 0 {
		if mant == 0 {
			return math.Float32frombits(sign) // signed zero
		}
		// Subnormal: shift the mantissa up until the implicit leading bit
		// appears, adjusting the exponent to match.
		for mant&0x0400 == 0 {
			mant <<= 1
			exp--
		}
		exp++
		mant &= 0x03FF
	} else if exp == 0x1F {
		if mant == 0 {
			return math.Float32frombits(sign | 0x7F800000) // +/-Inf
		}
		return math.Float32frombits(sign | 0x7FC00000) // NaN
	}
	// Rebias the exponent from binary16 (15) to binary32 (127).
	exp += 127 - 15
	return math.Float32frombits(sign | (uint32(exp) << 23) | (mant << 13))
}

// bfloat16BitsToFloat32 widens a bfloat16 bit pattern to float32; bfloat16 is
// the top 16 bits of a float32, so a left shift suffices (e.g. 0x3F80 -> 1.0).
func bfloat16BitsToFloat32(bits uint16) float32 {
	return math.Float32frombits(uint32(bits) << 16)
}

// Legacy compatibility.

// DeviceConfig is the legacy placement configuration, kept for callers that
// still pass an explicit per-layer placement list.
type DeviceConfig struct {
	LayerDevices []tensor.DevicePlacement
}

// LoadModel adapts a legacy explicit placement list to Load by counting the
// CUDA entries and passing that count as GPULayers.
func LoadModel(path string, config DeviceConfig) (*Engine, error) {
	cfg := DefaultConfig()
	if len(config.LayerDevices) > 0 {
		gpuCount := 0
		for _, p := range config.LayerDevices {
			if p.Type == tensor.CUDA {
				gpuCount++
			}
		}
		cfg.GPULayers = gpuCount
	}
	return Load(path, cfg)
}