- //go:build cuda
- // Package compute provides GPU weight caching so model weights persist on-device across calls.
- package compute
- import (
- "fmt"
- "log"
- "os"
- "sort"
- "strconv"
- "strings"
- "sync"
- "unsafe"
- "makarna/pkg/backend/cpu"
- "makarna/pkg/backend/cuda"
- "makarna/pkg/tensor"
- )
- // GPUWeightCache stores quantized and float weights on the GPU for reuse across calls.
- // This is the key optimization: upload once, use many times.
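- //
- // Typical call pattern (a minimal sketch; the cache key and the *cpu.Tensor
- // variable w are illustrative, not names defined in this package):
- //
- //	cache := GetWeightCache(0) // device 0
- //	key := "layer12.ffn_up"
- //	ptr, ok := cache.Get(key)
- //	if !ok {
- //		var err error
- //		if ptr, err = cache.Upload(key, w); err != nil {
- //			return err
- //		}
- //	}
- //	// ptr is a device pointer that stays valid until the cache is cleared.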
- type GPUWeightCache struct {
- mu sync.RWMutex
- weights map[string]cachedWeight
- gpu int
- totalMem uint64 // bytes allocated
- allocByLayer map[int]uint64
- dupByTensor map[uintptr]string
- allocCount uint64
- }
- type cachedWeight struct {
- ptr unsafe.Pointer
- tensor *cuda.Tensor // Keep reference for float weights to prevent GC
- dtype tensor.DType
- shape tensor.Shape
- numBlocks int
- sizeBytes int
- }
- // Global weight cache per GPU
- var (
- weightCaches = make(map[int]*GPUWeightCache)
- weightCacheMu sync.Mutex
- )
- // GetWeightCache returns the weight cache for a GPU, creating if needed.
- func GetWeightCache(gpu int) *GPUWeightCache {
- weightCacheMu.Lock()
- defer weightCacheMu.Unlock()
- if cache, ok := weightCaches[gpu]; ok {
- return cache
- }
- cache := &GPUWeightCache{
- weights: make(map[string]cachedWeight),
- gpu: gpu,
- allocByLayer: make(map[int]uint64),
- dupByTensor: make(map[uintptr]string),
- }
- weightCaches[gpu] = cache
- return cache
- }
- // Get returns a cached GPU weight pointer, or (nil, false) if the key is not cached.
- func (c *GPUWeightCache) Get(key string) (unsafe.Pointer, bool) {
- c.mu.RLock()
- defer c.mu.RUnlock()
- if w, ok := c.weights[key]; ok {
- return w.ptr, true
- }
- return nil, false
- }
- // GetTensor returns the cached CUDA tensor for float weights (Float16/BFloat16/Float32 on GPU).
- // For quantized weights, this returns (nil, false) since they are stored as raw device pointers.
- func (c *GPUWeightCache) GetTensor(key string) (*cuda.Tensor, bool) {
- c.mu.RLock()
- defer c.mu.RUnlock()
- w, ok := c.weights[key]
- if !ok {
- return nil, false
- }
- if w.tensor == nil {
- return nil, false
- }
- return w.tensor, true
- }
- // Upload uploads a CPU tensor to GPU and caches it.
- // Returns the GPU pointer for immediate use.
- func (c *GPUWeightCache) Upload(key string, t *cpu.Tensor) (unsafe.Pointer, error) {
- c.mu.Lock()
- defer c.mu.Unlock()
- // Check if already cached
- if w, ok := c.weights[key]; ok {
- return w.ptr, nil
- }
- shape := t.Shape()
- dtype := t.DType()
- numElements := shape.NumElements()
- var ptr unsafe.Pointer
- var sizeBytes int
- var numBlocks int
- var err error
- switch dtype {
- case tensor.Float16, tensor.BFloat16:
- sizeBytes = numElements * 2
- gpuTensor, err2 := cuda.NewTensor(shape, dtype, c.gpu)
- if err2 != nil {
- return nil, fmt.Errorf("alloc %v weight: %w", dtype, err2)
- }
- if numElements > 0 {
- srcPtr := t.Data().(unsafe.Pointer)
- dstPtr := gpuTensor.Data().(unsafe.Pointer)
- if err2 := cuda.MemcpyH2D(dstPtr, srcPtr, uintptr(sizeBytes), c.gpu); err2 != nil {
- gpuTensor.Free()
- return nil, fmt.Errorf("copy %v weight: %w", dtype, err2)
- }
- }
- ptr = gpuTensor.Data().(unsafe.Pointer)
- // Store tensor reference to prevent GC.
- c.weights[key] = cachedWeight{
- ptr: ptr,
- tensor: gpuTensor,
- dtype: dtype,
- shape: shape,
- numBlocks: 0,
- sizeBytes: sizeBytes,
- }
- c.totalMem += uint64(sizeBytes)
- c.recordAlloc(key, sizeBytes, dtype, shape, t)
- return ptr, nil
- case tensor.Q8_K:
- numBlocks = numElements / 256
- sizeBytes = numBlocks * 292 // 4 (d) + 256 (qs) + 32 (bsums)
- data := t.Data().(unsafe.Pointer)
- dataSlice := unsafe.Slice((*byte)(data), sizeBytes)
- ptr, err = cuda.UploadQ8K(dataSlice, numBlocks, c.gpu)
- case tensor.Q5_K:
- numBlocks = numElements / 256
- sizeBytes = numBlocks * 176 // 2 (d) + 2 (dmin) + 12 (scales) + 32 (qh) + 128 (qs)
- data := t.Data().(unsafe.Pointer)
- dataSlice := unsafe.Slice((*byte)(data), sizeBytes)
- ptr, err = cuda.UploadQ5K(dataSlice, numBlocks, c.gpu)
- case tensor.Q4_K:
- numBlocks = numElements / 256
- sizeBytes = numBlocks * 144 // 2 (d) + 2 (dmin) + 12 (scales) + 128 (qs)
- data := t.Data().(unsafe.Pointer)
- dataSlice := unsafe.Slice((*byte)(data), sizeBytes)
- ptr, err = cuda.UploadQ4K(dataSlice, numBlocks, c.gpu)
- case tensor.Q2_K:
- numBlocks = numElements / 256
- sizeBytes = numBlocks * 84 // 16 (scales) + 64 (qs) + 2 (d) + 2 (dmin)
- data := t.Data().(unsafe.Pointer)
- dataSlice := unsafe.Slice((*byte)(data), sizeBytes)
- ptr, err = cuda.UploadQ2K(dataSlice, numBlocks, c.gpu)
- case tensor.Q3_K:
- numBlocks = numElements / 256
- sizeBytes = numBlocks * 110 // 32(hm) + 64(qs) + 12(scales) + 2(d)
- data := t.Data().(unsafe.Pointer)
- dataSlice := unsafe.Slice((*byte)(data), sizeBytes)
- ptr, err = cuda.UploadQ3K(dataSlice, numBlocks, c.gpu)
- case tensor.Q6_K:
- numBlocks = numElements / 256
- sizeBytes = numBlocks * 210 // 128(ql) + 64(qh) + 16(scales) + 2(d)
- data := t.Data().(unsafe.Pointer)
- dataSlice := unsafe.Slice((*byte)(data), sizeBytes)
- ptr, err = cuda.UploadQ6K(dataSlice, numBlocks, c.gpu)
- case tensor.Float32:
- sizeBytes = numElements * 4
- gpuTensor, err2 := cuda.NewTensor(shape, tensor.Float32, c.gpu)
- if err2 != nil {
- return nil, fmt.Errorf("alloc F32 weight: %w", err2)
- }
- if err2 := gpuTensor.CopyFrom(t.DataFloat32()); err2 != nil {
- gpuTensor.Free()
- return nil, fmt.Errorf("copy F32 weight: %w", err2)
- }
- ptr = gpuTensor.Data().(unsafe.Pointer)
- // Store tensor reference to prevent GC.
- c.weights[key] = cachedWeight{
- ptr: ptr,
- tensor: gpuTensor,
- dtype: dtype,
- shape: shape,
- numBlocks: 0,
- sizeBytes: sizeBytes,
- }
- c.totalMem += uint64(sizeBytes)
- c.recordAlloc(key, sizeBytes, dtype, shape, t)
- return ptr, nil
- default:
- return nil, fmt.Errorf("unsupported dtype for GPU cache: %v", dtype)
- }
- if err != nil {
- return nil, err
- }
- c.weights[key] = cachedWeight{
- ptr: ptr,
- tensor: nil, // Quant weights store raw device pointers.
- dtype: dtype,
- shape: shape,
- numBlocks: numBlocks,
- sizeBytes: sizeBytes,
- }
- c.totalMem += uint64(sizeBytes)
- c.recordAlloc(key, sizeBytes, dtype, shape, t)
- return ptr, nil
- }
- // UploadF16 uploads a Float32 CPU tensor to GPU as Float16 and caches it.
- // Intended for Tensor Core GEMM paths (e.g., dense matmul weights).
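- // The Float32 data is staged in a temporary on-device buffer and converted with
- // CastF32ToF16, so no host-side F16 copy is needed. A minimal sketch (the key and
- // the *cpu.Tensor variable w are illustrative):
- //
- //	ptr, err := cache.UploadF16("layer3.attn.q_proj", w) // w must be Float32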
- func (c *GPUWeightCache) UploadF16(key string, t *cpu.Tensor) (unsafe.Pointer, error) {
- c.mu.Lock()
- defer c.mu.Unlock()
- // Check if already cached
- if w, ok := c.weights[key]; ok {
- return w.ptr, nil
- }
- if t.DType() != tensor.Float32 {
- return nil, fmt.Errorf("UploadF16: expected Float32 tensor, got %v", t.DType())
- }
- shape := t.Shape()
- numElements := shape.NumElements()
- sizeBytes := numElements * 2
- tmpF32, err := cuda.NewTensor(shape, tensor.Float32, c.gpu)
- if err != nil {
- return nil, fmt.Errorf("alloc temp F32 weight: %w", err)
- }
- if err := tmpF32.CopyFrom(t.DataFloat32()); err != nil {
- tmpF32.Free()
- return nil, fmt.Errorf("copy temp F32 weight: %w", err)
- }
- gpuTensor, err := cuda.NewTensor(shape, tensor.Float16, c.gpu)
- if err != nil {
- tmpF32.Free()
- return nil, fmt.Errorf("alloc F16 weight: %w", err)
- }
- if err := cuda.CastF32ToF16(tmpF32.Data().(unsafe.Pointer), gpuTensor.Data().(unsafe.Pointer), numElements, c.gpu); err != nil {
- tmpF32.Free()
- gpuTensor.Free()
- return nil, fmt.Errorf("cast weight F32->F16: %w", err)
- }
- tmpF32.Free()
- ptr := gpuTensor.Data().(unsafe.Pointer)
- c.weights[key] = cachedWeight{
- ptr: ptr,
- tensor: gpuTensor,
- dtype: tensor.Float16,
- shape: shape,
- numBlocks: 0,
- sizeBytes: sizeBytes,
- }
- c.totalMem += uint64(sizeBytes)
- c.recordAlloc(key, sizeBytes, tensor.Float16, shape, t)
- return ptr, nil
- }
- // TotalMemory returns the total GPU memory used by the cache, in bytes.
- func (c *GPUWeightCache) TotalMemory() uint64 {
- c.mu.RLock()
- defer c.mu.RUnlock()
- return c.totalMem
- }
- // Clear frees all cached weights.
- func (c *GPUWeightCache) Clear() {
- c.mu.Lock()
- defer c.mu.Unlock()
- for _, w := range c.weights {
- if w.tensor == nil {
- // Quantized weights are raw device pointers and must be freed explicitly.
- cuda.FreeDevicePtr(w.ptr)
- }
- // Float weights hold a *cuda.Tensor; resetting the map below drops the last
- // cache reference so its finalizer can release the GPU memory.
- }
- c.weights = make(map[string]cachedWeight)
- c.totalMem = 0
- c.allocByLayer = make(map[int]uint64)
- c.dupByTensor = make(map[uintptr]string)
- c.allocCount = 0
- }
- // ClearAllCaches frees all GPU weight caches.
- func ClearAllCaches() {
- weightCacheMu.Lock()
- defer weightCacheMu.Unlock()
- for _, cache := range weightCaches {
- cache.Clear()
- }
- weightCaches = make(map[int]*GPUWeightCache)
- }
- // LogWeightCacheSummary prints per-GPU and per-layer allocation summaries when enabled.
- func LogWeightCacheSummary() {
- if !weightMemLogSummaryEnabled() {
- return
- }
- weightCacheMu.Lock()
- caches := make([]*GPUWeightCache, 0, len(weightCaches))
- for _, cache := range weightCaches {
- caches = append(caches, cache)
- }
- weightCacheMu.Unlock()
- for _, cache := range caches {
- cache.dumpSummary()
- }
- }
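- // recordAlloc updates the per-layer accounting and optional duplicate-upload
- // tracking. Callers must already hold c.mu (Upload and UploadF16 do).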
- func (c *GPUWeightCache) recordAlloc(key string, sizeBytes int, dtype tensor.DType, shape tensor.Shape, t *cpu.Tensor) {
- if sizeBytes <= 0 {
- return
- }
- layer, ok := layerFromCacheKey(key)
- if !ok {
- layer = -1
- }
- if c.allocByLayer == nil {
- c.allocByLayer = make(map[int]uint64)
- }
- c.allocByLayer[layer] += uint64(sizeBytes)
- c.allocCount++
- if weightMemLogAllocEnabled() {
- log.Printf("gpu-cache alloc gpu=%d layer=%d bytes=%d total=%s key=%s dtype=%s shape=%v",
- c.gpu, layer, sizeBytes, formatBytes(c.totalMem), key, dtype.String(), shape)
- if t != nil {
- if c.dupByTensor == nil {
- c.dupByTensor = make(map[uintptr]string)
- }
- tID := uintptr(unsafe.Pointer(t))
- if prev, ok := c.dupByTensor[tID]; ok && prev != key {
- log.Printf("gpu-cache dup gpu=%d tensor=%p prev_key=%s new_key=%s", c.gpu, t, prev, key)
- } else if !ok {
- c.dupByTensor[tID] = key
- }
- }
- }
- }
- func (c *GPUWeightCache) dumpSummary() {
- if c == nil {
- return
- }
- c.mu.RLock()
- total := c.totalMem
- allocCount := c.allocCount
- byLayer := make([]layerAlloc, 0, len(c.allocByLayer))
- for layer, bytes := range c.allocByLayer {
- byLayer = append(byLayer, layerAlloc{layer: layer, bytes: bytes})
- }
- c.mu.RUnlock()
- sort.Slice(byLayer, func(i, j int) bool {
- return byLayer[i].layer < byLayer[j].layer
- })
- totalMem, freeMem, err := cuda.MemoryInfoDevice(c.gpu)
- if err != nil {
- log.Printf("gpu-cache summary gpu=%d total=%s allocs=%d", c.gpu, formatBytes(total), allocCount)
- } else {
- log.Printf("gpu-cache summary gpu=%d total=%s allocs=%d free=%s/%s", c.gpu, formatBytes(total), allocCount, formatBytes(freeMem), formatBytes(totalMem))
- }
- for _, entry := range byLayer {
- label := "shared"
- if entry.layer >= 0 {
- label = fmt.Sprintf("layer%d", entry.layer)
- }
- log.Printf("gpu-cache layer=%s bytes=%s", label, formatBytes(entry.bytes))
- }
- }
- type layerAlloc struct {
- layer int
- bytes uint64
- }
- var (
- weightMemLogOnce sync.Once
- weightMemLogAlloc bool
- weightMemLogSummary bool
- )
- func weightMemLogAllocEnabled() bool {
- weightMemLogInit()
- return weightMemLogAlloc
- }
- func weightMemLogSummaryEnabled() bool {
- weightMemLogInit()
- return weightMemLogSummary
- }
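- // weightMemLogInit parses MAKARNA_GPU_MEMLOG once. Empty, "0", "false", or "off"
- // disables logging; "1", "true", or "all" enables both alloc and summary logging;
- // otherwise any value containing "alloc" and/or "summary" enables those modes,
- // with alloc logging as the fallback (e.g. MAKARNA_GPU_MEMLOG=alloc,summary).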
- func weightMemLogInit() {
- weightMemLogOnce.Do(func() {
- raw := strings.ToLower(strings.TrimSpace(os.Getenv("MAKARNA_GPU_MEMLOG")))
- if raw == "" || raw == "0" || raw == "false" || raw == "off" {
- return
- }
- switch raw {
- case "1", "true", "all":
- weightMemLogAlloc = true
- weightMemLogSummary = true
- return
- }
- if strings.Contains(raw, "alloc") {
- weightMemLogAlloc = true
- }
- if strings.Contains(raw, "summary") {
- weightMemLogSummary = true
- }
- if !weightMemLogAlloc && !weightMemLogSummary {
- weightMemLogAlloc = true
- }
- })
- }
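- // layerFromCacheKey extracts the layer index from keys beginning with "layer<N>"
- // or "kda_l<N>" (e.g. an illustrative key such as "layer12.ffn_up"); it reports
- // ok=false when the key has no recognized layer prefix.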
- func layerFromCacheKey(key string) (int, bool) {
- if strings.HasPrefix(key, "layer") {
- rest := key[len("layer"):]
- n := readLeadingInt(rest)
- if n >= 0 {
- return n, true
- }
- }
- if strings.HasPrefix(key, "kda_l") {
- rest := key[len("kda_l"):]
- n := readLeadingInt(rest)
- if n >= 0 {
- return n, true
- }
- }
- return 0, false
- }
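- // readLeadingInt parses the run of leading ASCII digits in s and returns its
- // value, or -1 if s does not start with a digit.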
- func readLeadingInt(s string) int {
- if s == "" {
- return -1
- }
- end := 0
- for end < len(s) && s[end] >= '0' && s[end] <= '9' {
- end++
- }
- if end == 0 {
- return -1
- }
- n, err := strconv.Atoi(s[:end])
- if err != nil {
- return -1
- }
- return n
- }
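- // formatBytes renders a byte count with binary (1024-based) units,
- // e.g. formatBytes(1536) == "1.50KB".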
- func formatBytes(v uint64) string {
- const unit = 1024
- if v < unit {
- return fmt.Sprintf("%dB", v)
- }
- div, exp := uint64(unit), 0
- for n := v / unit; n >= unit && exp < 4; n /= unit {
- div *= unit
- exp++
- }
- value := float64(v) / float64(div)
- suffix := []string{"KB", "MB", "GB", "TB", "PB"}[exp]
- return fmt.Sprintf("%.2f%s", value, suffix)
- }