package graph

import "makarna/pkg/tensor"

// ExecutionPlan describes the static computation graph for a request.
// It is intentionally lightweight so the plan can be reused across
// decode steps without rebuilding the structure.
type ExecutionPlan struct {
	// RequestID links the plan to a running session.
	RequestID string
	// MaxContext is the number of tokens reserved for this request.
	// The KV cache manager must have already reserved enough blocks
	// to satisfy this budget.
	MaxContext int
	// BlockSize controls how many tokens are packed into each KV block.
	BlockSize int
	// Layers lists per-layer stage information (prefill/decode flags).
	Layers []LayerPlan
}

// LayerPlan captures per-layer execution intent. The current engine only
// needs to distinguish whether a layer participates in decode.
type LayerPlan struct {
	Index          int
	HasAttention   bool
	HasMLP         bool
	SupportsDecode bool
	Device         tensor.DevicePlacement
}

// RequestSpec declares what a caller wants to run. The scheduler converts
// this into an ExecutionPlan and hands it to the runtime.
type RequestSpec struct {
	ID           string
	MaxContext   int
	BlockSize    int
	NumLayers    int
	UseAttention bool
	LayerDevices []tensor.DevicePlacement
}

// BuildPlan produces a minimal ExecutionPlan suitable for single-GPU decode.
// The plan stays constant while the scheduler feeds new token batches.
// Layers without an explicit entry in spec.LayerDevices fall back to CPU.
func BuildPlan(spec RequestSpec) ExecutionPlan {
	plan := ExecutionPlan{
		RequestID:  spec.ID,
		MaxContext: spec.MaxContext,
		BlockSize:  spec.BlockSize,
		Layers:     make([]LayerPlan, spec.NumLayers),
	}
	for i := 0; i < spec.NumLayers; i++ {
		// Default placement is CPU; GPU index -1 marks "no device assigned".
		device := tensor.DevicePlacement{Type: tensor.CPU, GPU: -1}
		if i < len(spec.LayerDevices) {
			device = spec.LayerDevices[i].Normalize()
		}
		plan.Layers[i] = LayerPlan{
			Index:          i,
			HasAttention:   spec.UseAttention,
			HasMLP:         true,
			SupportsDecode: true,
			Device:         device,
		}
	}
	return plan
}
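
// A minimal usage sketch for BuildPlan. The request ID and numeric values
// below are hypothetical, chosen only for illustration; the placement
// literal reuses the CPU fallback shape shown in BuildPlan above, since
// this file only establishes tensor.CPU as a device type.
//
//	spec := RequestSpec{
//		ID:           "req-0", // hypothetical session ID
//		MaxContext:   4096,
//		BlockSize:    16,
//		NumLayers:    2,
//		UseAttention: true,
//		LayerDevices: []tensor.DevicePlacement{{Type: tensor.CPU, GPU: -1}},
//	}
//	plan := BuildPlan(spec)
//	// plan.Layers[0] carries the supplied (normalized) placement, while
//	// plan.Layers[1] falls back to the CPU default because no placement
//	// was provided for it.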