package graph

import "makarna/pkg/tensor"

// ExecutionPlan describes the static computation graph for a request.
// It is intentionally lightweight so the plan can be reused across
// decode steps without rebuilding the structure.
type ExecutionPlan struct {
	// RequestID links the plan to a running session.
	RequestID string
	// MaxContext is the number of tokens reserved for this request.
	// The KV cache manager must have already reserved enough blocks
	// to satisfy this budget.
	MaxContext int
	// BlockSize controls how many tokens are packed into each KV block.
	BlockSize int
	// Layers lists per-layer stage information (prefill/decode flags).
	Layers []LayerPlan
}

// LayerPlan captures per-layer execution intent. The current engine only
// needs to distinguish whether a layer participates in decode.
type LayerPlan struct {
	Index          int
	HasAttention   bool
	HasMLP         bool
	SupportsDecode bool
	Device         tensor.DevicePlacement
}

// RequestSpec declares what a caller wants to run. The scheduler converts
// this into an ExecutionPlan and hands it to the runtime.
type RequestSpec struct {
	ID           string
	MaxContext   int
	BlockSize    int
	NumLayers    int
	UseAttention bool
	LayerDevices []tensor.DevicePlacement
}

// BuildPlan produces a minimal ExecutionPlan suitable for single-GPU decode.
// The plan stays constant while the scheduler feeds new token batches.
// Layers without an explicit entry in spec.LayerDevices fall back to CPU.
func BuildPlan(spec RequestSpec) ExecutionPlan {
	plan := ExecutionPlan{
		RequestID:  spec.ID,
		MaxContext: spec.MaxContext,
		BlockSize:  spec.BlockSize,
		Layers:     make([]LayerPlan, spec.NumLayers),
	}
	for i := 0; i < spec.NumLayers; i++ {
		// Default placement is CPU; GPU index -1 marks "no device assigned".
		device := tensor.DevicePlacement{Type: tensor.CPU, GPU: -1}
		if i < len(spec.LayerDevices) {
			device = spec.LayerDevices[i].Normalize()
		}
		plan.Layers[i] = LayerPlan{
			Index:          i,
			HasAttention:   spec.UseAttention,
			HasMLP:         true,
			SupportsDecode: true,
			Device:         device,
		}
	}
	return plan
}
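
// A minimal usage sketch for BuildPlan. The request ID and numeric values
// below are hypothetical, chosen only for illustration; the placement
// literal reuses the CPU fallback shape shown in BuildPlan above, since
// this file only establishes tensor.CPU as a device type.
//
//	spec := RequestSpec{
//		ID:           "req-0", // hypothetical session ID
//		MaxContext:   4096,
//		BlockSize:    16,
//		NumLayers:    2,
//		UseAttention: true,
//		LayerDevices: []tensor.DevicePlacement{{Type: tensor.CPU, GPU: -1}},
//	}
//	plan := BuildPlan(spec)
//	// plan.Layers[0] carries the supplied (normalized) placement, while
//	// plan.Layers[1] falls back to the CPU default because no placement
//	// was provided for it.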