interface.go
// Package kvcache defines the unified KV cache interface used by models.
package kvcache

import "makarna/pkg/tensor"

// View is a single live block view of cached K/V data for one layer.
type View struct {
    K      tensor.Tensor     // cached keys for this block
    V      tensor.Tensor     // cached values for this block
    Start  int               // starting token position (inclusive)
    Length int               // number of tokens in this block
    Device tensor.DeviceType // device type where this block resides
    GPU    int               // GPU index when Device is a GPU device
}
// PackedView exposes a head-major packed view of KV data for CPU attention.
// Layout for K/V:
//
//	[kvHead][tokenWithinBlock][headDim]
//
// Stored contiguously as a flat slice.
// This is an optional fast path and is not part of KVCacheInterface.
type PackedView struct {
    K          []float32
    V          []float32
    Start      int // inclusive
    Length     int
    BlockSize  int
    HeadDim    int
    NumKVHeads int
}
// PackedViewsProvider is an optional interface implemented by caches that can
// expose head-major packed KV views for fast CPU attention kernels.
type PackedViewsProvider interface {
    ViewsPacked(layer int) []PackedView
}
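
// Illustrative sketch (not part of the original file): since PackedViewsProvider
// is optional, a CPU attention kernel can probe for the packed fast path with a
// type assertion and fall back to the generic block views otherwise. The helper
// name packedOrGenericViews is hypothetical.
func packedOrGenericViews(c KVCacheInterface, layer int) ([]PackedView, []View) {
    if p, ok := c.(PackedViewsProvider); ok {
        // Cache supports the head-major packed fast path.
        return p.ViewsPacked(layer), nil
    }
    // Fall back to the generic per-block views.
    return nil, c.Views(layer)
}
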
// KVCacheInterface is the unified interface for KV cache implementations.
// Both the legacy Cache and new PagedKVCache implement this interface.
type KVCacheInterface interface {
    // SeqLen returns the current sequence length (number of computed tokens).
    SeqLen() int

    // Commit advances the sequence length after processing new tokens.
    Commit(newTokens int)

    // Append writes new K/V tokens into the cache for a layer.
    // Returns views of the appended data and the start position.
    Append(layer int, k, v tensor.Tensor) ([]View, int, error)

    // ContiguousKV returns a contiguous view of K/V for attention.
    // Returns (k, v, ok, error) where ok indicates if contiguous mode is available.
    ContiguousKV(layer, kvLen, kvDim int) (tensor.Tensor, tensor.Tensor, bool, error)

    // Views returns the live KV block views for a layer.
    Views(layer int) []View

    // IsOnGPU returns true if the cache is on GPU.
    IsOnGPU() bool

    // LayerDevice returns the device placement for a layer.
    LayerDevice(layer int) tensor.DevicePlacement

    // MaxSeqLen returns the maximum sequence length.
    MaxSeqLen() int

    // Truncate rewinds the cache to a specific sequence length.
    Truncate(seqLen int)

    // Free releases all resources.
    Free()
}
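
// Illustrative sketch (not part of the original file) of the expected call order
// for one decode step: Append writes the step's K/V into each layer, then Commit
// advances SeqLen. It is an assumption here that Commit is called once per step
// with the number of tokens just appended, and that ks and vs are per-layer
// slices of equal length; the helper name appendStep is hypothetical.
func appendStep(c KVCacheInterface, ks, vs []tensor.Tensor, newTokens int) error {
    for layer := range ks {
        if _, _, err := c.Append(layer, ks[layer], vs[layer]); err != nil {
            return err
        }
    }
    // All layers written; make the new tokens visible to SeqLen.
    c.Commit(newTokens)
    return nil
}
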
// Verify that Cache and PagedKVCache implement KVCacheInterface.
var _ KVCacheInterface = (*PagedKVCache)(nil)
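
// Illustrative sketch (not part of the original file): Truncate can be used to
// discard tokens beyond a previously recorded sequence length, for example when
// rolling back rejected speculative tokens. The rollback use case and the helper
// name rollbackTo are assumptions.
func rollbackTo(c KVCacheInterface, seqLen int) {
    if seqLen < c.SeqLen() {
        c.Truncate(seqLen)
    }
}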