interface.go
// Package kvcache defines the unified KV cache interface used by models.
package kvcache

import "makarna/pkg/tensor"

// View is a single live block view of cached K/V data for one layer.
type View struct {
    K      tensor.Tensor     // cached keys for this block
    V      tensor.Tensor     // cached values for this block
    Start  int               // starting token position (inclusive)
    Length int               // number of tokens in this block
    Device tensor.DeviceType // device type where this block resides
    GPU    int               // GPU index when Device is a GPU device
}
// PackedView exposes a head-major packed view of KV data for CPU attention.
// Layout for K/V:
//
//	[kvHead][tokenWithinBlock][headDim]
//
// Stored contiguously as a flat slice.
// This is an optional fast path and is not part of KVCacheInterface.
type PackedView struct {
    K          []float32
    V          []float32
    Start      int // inclusive
    Length     int
    BlockSize  int
    HeadDim    int
    NumKVHeads int
}
// PackedViewsProvider is an optional interface implemented by caches that can
// expose head-major packed KV views for fast CPU attention kernels.
type PackedViewsProvider interface {
    ViewsPacked(layer int) []PackedView
}
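
// Illustrative sketch (not part of the original file): since PackedViewsProvider
// is optional, a CPU attention kernel can probe for the packed fast path with a
// type assertion and fall back to the generic block views otherwise. The helper
// name packedOrGenericViews is hypothetical.
func packedOrGenericViews(c KVCacheInterface, layer int) ([]PackedView, []View) {
    if p, ok := c.(PackedViewsProvider); ok {
        // Cache supports the head-major packed fast path.
        return p.ViewsPacked(layer), nil
    }
    // Fall back to the generic per-block views.
    return nil, c.Views(layer)
}
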
// KVCacheInterface is the unified interface for KV cache implementations.
// Both the legacy Cache and new PagedKVCache implement this interface.
type KVCacheInterface interface {
    // SeqLen returns the current sequence length (number of computed tokens).
    SeqLen() int

    // Commit advances the sequence length after processing new tokens.
    Commit(newTokens int)

    // Append writes new K/V tokens into the cache for a layer.
    // Returns views of the appended data and the start position.
    Append(layer int, k, v tensor.Tensor) ([]View, int, error)

    // ContiguousKV returns a contiguous view of K/V for attention.
    // Returns (k, v, ok, error) where ok indicates if contiguous mode is available.
    ContiguousKV(layer, kvLen, kvDim int) (tensor.Tensor, tensor.Tensor, bool, error)

    // Views returns the live KV block views for a layer.
    Views(layer int) []View

    // IsOnGPU returns true if the cache is on GPU.
    IsOnGPU() bool

    // LayerDevice returns the device placement for a layer.
    LayerDevice(layer int) tensor.DevicePlacement

    // MaxSeqLen returns the maximum sequence length.
    MaxSeqLen() int

    // Truncate rewinds the cache to a specific sequence length.
    Truncate(seqLen int)

    // Free releases all resources.
    Free()
}
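
// Illustrative sketch (not part of the original file) of the expected call order
// for one decode step: Append writes the step's K/V into each layer, then Commit
// advances SeqLen. It is an assumption here that Commit is called once per step
// with the number of tokens just appended, and that ks and vs are per-layer
// slices of equal length; the helper name appendStep is hypothetical.
func appendStep(c KVCacheInterface, ks, vs []tensor.Tensor, newTokens int) error {
    for layer := range ks {
        if _, _, err := c.Append(layer, ks[layer], vs[layer]); err != nil {
            return err
        }
    }
    // All layers written; make the new tokens visible to SeqLen.
    c.Commit(newTokens)
    return nil
}
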
// Verify that Cache and PagedKVCache implement KVCacheInterface.
var _ KVCacheInterface = (*PagedKVCache)(nil)
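
// Illustrative sketch (not part of the original file): Truncate can be used to
// discard tokens beyond a previously recorded sequence length, for example when
// rolling back rejected speculative tokens. The rollback use case and the helper
// name rollbackTo are assumptions.
func rollbackTo(c KVCacheInterface, seqLen int) {
    if seqLen < c.SeqLen() {
        c.Truncate(seqLen)
    }
}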