| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273 |
- // Package kvcache defines the unified KV cache interface used by models.
- package kvcache
- import "makarna/pkg/tensor"
- // View is a window into the cached K/V data for a single layer, as
- // returned by KVCacheInterface.Append and KVCacheInterface.Views.
- type View struct {
- K tensor.Tensor // key data backing this view
- V tensor.Tensor // value data backing this view
- Start int // starting token position of the view (presumably inclusive, matching PackedView.Start — confirm)
- Length int // number of tokens covered by this view
- Device tensor.DeviceType // device the view's tensors reside on
- GPU int // GPU index; NOTE(review): presumably only meaningful for GPU device types — confirm
- }
- // PackedView exposes a head-major packed view of KV data for CPU attention.
- // Layout for K/V:
- // [kvHead][tokenWithinBlock][headDim]
- // Stored contiguously as a flat slice.
- // This is an optional fast-path and is not part of KVCacheInterface.
- type PackedView struct {
- K []float32 // packed keys in the head-major layout described above
- V []float32 // packed values; same layout and length as K
- Start int // inclusive
- Length int // number of live tokens in this view (presumably <= BlockSize — confirm)
- BlockSize int // tokens per block in the owning cache; fixes the tokenWithinBlock stride
- HeadDim int // per-head embedding dimension
- NumKVHeads int // number of KV heads packed into K and V
- }
- // PackedViewsProvider is an optional interface implemented by caches that can
- // expose head-major packed KV views for fast CPU attention kernels.
- // Callers should type-assert a KVCacheInterface value to this interface
- // before relying on it, since not every cache implementation provides it.
- type PackedViewsProvider interface {
- // ViewsPacked returns the packed KV views for the given layer.
- ViewsPacked(layer int) []PackedView
- }
- // KVCacheInterface is the unified interface for KV cache implementations.
- // Both the legacy Cache and new PagedKVCache implement this interface.
- type KVCacheInterface interface {
- // SeqLen returns the current sequence length (number of computed tokens).
- SeqLen() int
- 
- // Commit advances the sequence length after processing new tokens.
- // NOTE(review): presumably Append alone does not advance SeqLen and a
- // separate Commit is required — confirm against implementations.
- Commit(newTokens int)
- 
- // Append writes new K/V tokens into the cache for a layer.
- // Returns views of the appended data and the start position.
- Append(layer int, k, v tensor.Tensor) ([]View, int, error)
- 
- // ContiguousKV returns a contiguous view of K/V for attention.
- // Returns (k, v, ok, error) where ok indicates if contiguous mode is available.
- // Callers must fall back to Views when ok is false.
- ContiguousKV(layer, kvLen, kvDim int) (tensor.Tensor, tensor.Tensor, bool, error)
- 
- // Views returns the live KV block views for a layer.
- Views(layer int) []View
- 
- // IsOnGPU returns true if the cache is on GPU.
- IsOnGPU() bool
- 
- // LayerDevice returns the device placement for a layer.
- LayerDevice(layer int) tensor.DevicePlacement
- 
- // MaxSeqLen returns the maximum sequence length.
- MaxSeqLen() int
- 
- // Truncate rewinds the cache to a specific sequence length.
- // Presumably seqLen must not exceed the current SeqLen — confirm.
- Truncate(seqLen int)
- 
- // Free releases all resources.
- Free()
- }
- // Compile-time check that PagedKVCache implements KVCacheInterface.
- // NOTE(review): the interface doc above says the legacy Cache also
- // implements KVCacheInterface, but there is no corresponding assertion
- // here — consider adding `var _ KVCacheInterface = (*Cache)(nil)` next
- // to wherever Cache is defined.
- var _ KVCacheInterface = (*PagedKVCache)(nil)
|