//go:build cuda

// Package compute provides GPU weight caching for persistent weight storage.
package compute

import (
	"fmt"
	"log"
	"os"
	"sort"
	"strconv"
	"strings"
	"sync"
	"unsafe"

	"makarna/pkg/backend/cpu"
	"makarna/pkg/backend/cuda"
	"makarna/pkg/tensor"
)

// GPUWeightCache stores quantized weights on GPU for reuse across calls.
// This is the key optimization: upload once, use many times.
type GPUWeightCache struct {
	mu           sync.RWMutex
	weights      map[string]cachedWeight
	gpu          int
	totalMem     uint64 // bytes allocated
	allocByLayer map[int]uint64
	dupByTensor  map[uintptr]string
	allocCount   uint64
}

type cachedWeight struct {
	ptr       unsafe.Pointer
	tensor    *cuda.Tensor // Keep reference for float weights to prevent GC
	dtype     tensor.DType
	shape     tensor.Shape
	numBlocks int
	sizeBytes int
}

// Global weight caches, one per GPU.
var (
	weightCaches  = make(map[int]*GPUWeightCache)
	weightCacheMu sync.Mutex
)

// GetWeightCache returns the weight cache for a GPU, creating it if needed.
func GetWeightCache(gpu int) *GPUWeightCache {
	weightCacheMu.Lock()
	defer weightCacheMu.Unlock()
	if cache, ok := weightCaches[gpu]; ok {
		return cache
	}
	cache := &GPUWeightCache{
		weights:      make(map[string]cachedWeight),
		gpu:          gpu,
		allocByLayer: make(map[int]uint64),
		dupByTensor:  make(map[uintptr]string),
	}
	weightCaches[gpu] = cache
	return cache
}

// Get returns a cached GPU weight pointer, or nil if not cached.
func (c *GPUWeightCache) Get(key string) (unsafe.Pointer, bool) {
	c.mu.RLock()
	defer c.mu.RUnlock()
	if w, ok := c.weights[key]; ok {
		return w.ptr, true
	}
	return nil, false
}

// GetTensor returns the cached CUDA tensor for float weights (Float32/Float16 on GPU).
// For quantized weights, this returns (nil, false) since they are stored as raw device pointers.
func (c *GPUWeightCache) GetTensor(key string) (*cuda.Tensor, bool) {
	c.mu.RLock()
	defer c.mu.RUnlock()
	w, ok := c.weights[key]
	if !ok {
		return nil, false
	}
	if w.tensor == nil {
		return nil, false
	}
	return w.tensor, true
}

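// ensureWeightOnGPU is an illustrative sketch, not used elsewhere in this
// package: it shows the intended get-or-upload pattern, checking the cache
// first and uploading only on a miss so each weight crosses the host-to-device
// bus once. The key format is up to the caller; "layer<N>.<name>" style keys
// are only an assumption for this example.
func ensureWeightOnGPU(gpu int, key string, w *cpu.Tensor) (unsafe.Pointer, error) {
	cache := GetWeightCache(gpu)
	if ptr, ok := cache.Get(key); ok {
		// Hit: the weight is already resident on this GPU.
		return ptr, nil
	}
	// Miss: upload once; later calls return the cached device pointer.
	return cache.Upload(key, w)
}
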
// Upload uploads a CPU tensor to GPU and caches it.
// Returns the GPU pointer for immediate use.
func (c *GPUWeightCache) Upload(key string, t *cpu.Tensor) (unsafe.Pointer, error) {
	c.mu.Lock()
	defer c.mu.Unlock()

	// Check if already cached.
	if w, ok := c.weights[key]; ok {
		return w.ptr, nil
	}

	shape := t.Shape()
	dtype := t.DType()
	numElements := shape.NumElements()

	var ptr unsafe.Pointer
	var sizeBytes int
	var numBlocks int
	var err error

	switch dtype {
	case tensor.Float16, tensor.BFloat16:
		sizeBytes = numElements * 2
		gpuTensor, err2 := cuda.NewTensor(shape, dtype, c.gpu)
		if err2 != nil {
			return nil, fmt.Errorf("alloc %v weight: %w", dtype, err2)
		}
		if numElements > 0 {
			srcPtr := t.Data().(unsafe.Pointer)
			dstPtr := gpuTensor.Data().(unsafe.Pointer)
			if err2 := cuda.MemcpyH2D(dstPtr, srcPtr, uintptr(sizeBytes), c.gpu); err2 != nil {
				gpuTensor.Free()
				return nil, fmt.Errorf("copy %v weight: %w", dtype, err2)
			}
		}
		ptr = gpuTensor.Data().(unsafe.Pointer)
		// Store tensor reference to prevent GC.
		c.weights[key] = cachedWeight{
			ptr:       ptr,
			tensor:    gpuTensor,
			dtype:     dtype,
			shape:     shape,
			numBlocks: 0,
			sizeBytes: sizeBytes,
		}
		c.totalMem += uint64(sizeBytes)
		c.recordAlloc(key, sizeBytes, dtype, shape, t)
		return ptr, nil

	case tensor.Q8_K:
		numBlocks = numElements / 256
		sizeBytes = numBlocks * 292
		data := t.Data().(unsafe.Pointer)
		dataSlice := unsafe.Slice((*byte)(data), sizeBytes)
		ptr, err = cuda.UploadQ8K(dataSlice, numBlocks, c.gpu)

	case tensor.Q5_K:
		numBlocks = numElements / 256
		sizeBytes = numBlocks * 176
		data := t.Data().(unsafe.Pointer)
		dataSlice := unsafe.Slice((*byte)(data), sizeBytes)
		ptr, err = cuda.UploadQ5K(dataSlice, numBlocks, c.gpu)

	case tensor.Q4_K:
		numBlocks = numElements / 256
		sizeBytes = numBlocks * 144
		data := t.Data().(unsafe.Pointer)
		dataSlice := unsafe.Slice((*byte)(data), sizeBytes)
		ptr, err = cuda.UploadQ4K(dataSlice, numBlocks, c.gpu)

	case tensor.Q2_K:
		numBlocks = numElements / 256
		sizeBytes = numBlocks * 84 // 16 (scales) + 64 (qs) + 2 (d) + 2 (dmin)
		data := t.Data().(unsafe.Pointer)
		dataSlice := unsafe.Slice((*byte)(data), sizeBytes)
		ptr, err = cuda.UploadQ2K(dataSlice, numBlocks, c.gpu)

	case tensor.Q3_K:
		numBlocks = numElements / 256
		sizeBytes = numBlocks * 110 // 32 (hm) + 64 (qs) + 12 (scales) + 2 (d)
		data := t.Data().(unsafe.Pointer)
		dataSlice := unsafe.Slice((*byte)(data), sizeBytes)
		ptr, err = cuda.UploadQ3K(dataSlice, numBlocks, c.gpu)

	case tensor.Q6_K:
		numBlocks = numElements / 256
		sizeBytes = numBlocks * 210 // 128 (ql) + 64 (qh) + 16 (scales) + 2 (d)
		data := t.Data().(unsafe.Pointer)
		dataSlice := unsafe.Slice((*byte)(data), sizeBytes)
		ptr, err = cuda.UploadQ6K(dataSlice, numBlocks, c.gpu)

	case tensor.Float32:
		sizeBytes = numElements * 4
		gpuTensor, err2 := cuda.NewTensor(shape, tensor.Float32, c.gpu)
		if err2 != nil {
			return nil, fmt.Errorf("alloc F32 weight: %w", err2)
		}
		if err2 := gpuTensor.CopyFrom(t.DataFloat32()); err2 != nil {
			gpuTensor.Free()
			return nil, fmt.Errorf("copy F32 weight: %w", err2)
		}
		ptr = gpuTensor.Data().(unsafe.Pointer)
		// Store tensor reference to prevent GC.
		c.weights[key] = cachedWeight{
			ptr:       ptr,
			tensor:    gpuTensor,
			dtype:     dtype,
			shape:     shape,
			numBlocks: 0,
			sizeBytes: sizeBytes,
		}
		c.totalMem += uint64(sizeBytes)
		c.recordAlloc(key, sizeBytes, dtype, shape, t)
		return ptr, nil

	default:
		return nil, fmt.Errorf("unsupported dtype for GPU cache: %v", dtype)
	}

	if err != nil {
		return nil, err
	}

	c.weights[key] = cachedWeight{
		ptr:       ptr,
		tensor:    nil, // Quant weights store raw device pointers.
		dtype:     dtype,
		shape:     shape,
		numBlocks: numBlocks,
		sizeBytes: sizeBytes,
	}
	c.totalMem += uint64(sizeBytes)
	c.recordAlloc(key, sizeBytes, dtype, shape, t)
	return ptr, nil
}

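// quantCacheBytes is an illustrative helper, not called by Upload itself: it
// mirrors the size arithmetic in the switch above. Every K-quant block packs
// 256 elements, and the per-block byte counts match the constants used when
// uploading each quantized dtype; float dtypes are sized per element. It
// returns 0 for dtypes Upload does not cache.
func quantCacheBytes(dtype tensor.DType, numElements int) int {
	numBlocks := numElements / 256
	switch dtype {
	case tensor.Q2_K:
		return numBlocks * 84
	case tensor.Q3_K:
		return numBlocks * 110
	case tensor.Q4_K:
		return numBlocks * 144
	case tensor.Q5_K:
		return numBlocks * 176
	case tensor.Q6_K:
		return numBlocks * 210
	case tensor.Q8_K:
		return numBlocks * 292
	case tensor.Float16, tensor.BFloat16:
		return numElements * 2
	case tensor.Float32:
		return numElements * 4
	default:
		return 0
	}
}
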
// UploadF16 uploads a Float32 CPU tensor to GPU as Float16 and caches it.
// Intended for Tensor Core GEMM paths (e.g., dense matmul weights).
func (c *GPUWeightCache) UploadF16(key string, t *cpu.Tensor) (unsafe.Pointer, error) {
	c.mu.Lock()
	defer c.mu.Unlock()

	// Check if already cached.
	if w, ok := c.weights[key]; ok {
		return w.ptr, nil
	}
	if t.DType() != tensor.Float32 {
		return nil, fmt.Errorf("UploadF16: expected Float32 tensor, got %v", t.DType())
	}

	shape := t.Shape()
	numElements := shape.NumElements()
	sizeBytes := numElements * 2

	tmpF32, err := cuda.NewTensor(shape, tensor.Float32, c.gpu)
	if err != nil {
		return nil, fmt.Errorf("alloc temp F32 weight: %w", err)
	}
	if err := tmpF32.CopyFrom(t.DataFloat32()); err != nil {
		tmpF32.Free()
		return nil, fmt.Errorf("copy temp F32 weight: %w", err)
	}

	gpuTensor, err := cuda.NewTensor(shape, tensor.Float16, c.gpu)
	if err != nil {
		tmpF32.Free()
		return nil, fmt.Errorf("alloc F16 weight: %w", err)
	}
	if err := cuda.CastF32ToF16(tmpF32.Data().(unsafe.Pointer), gpuTensor.Data().(unsafe.Pointer), numElements, c.gpu); err != nil {
		tmpF32.Free()
		gpuTensor.Free()
		return nil, fmt.Errorf("cast weight F32->F16: %w", err)
	}
	tmpF32.Free()

	ptr := gpuTensor.Data().(unsafe.Pointer)
	c.weights[key] = cachedWeight{
		ptr:       ptr,
		tensor:    gpuTensor,
		dtype:     tensor.Float16,
		shape:     shape,
		numBlocks: 0,
		sizeBytes: sizeBytes,
	}
	c.totalMem += uint64(sizeBytes)
	c.recordAlloc(key, sizeBytes, tensor.Float16, shape, t)
	return ptr, nil
}

// TotalMemory returns the total GPU memory used by the cache, in bytes.
func (c *GPUWeightCache) TotalMemory() uint64 {
	c.mu.RLock()
	defer c.mu.RUnlock()
	return c.totalMem
}

// Clear frees all cached weights.
func (c *GPUWeightCache) Clear() {
	c.mu.Lock()
	defer c.mu.Unlock()
	for _, w := range c.weights {
		if w.tensor == nil {
			// Quantized weights hold raw device pointers; free them explicitly.
			cuda.FreeDevicePtr(w.ptr)
		}
		// Float weights keep a *cuda.Tensor reference; it is released by the
		// tensor's finalizer once the map below drops the last reference.
	}
	c.weights = make(map[string]cachedWeight)
	c.totalMem = 0
	c.allocByLayer = make(map[int]uint64)
	c.dupByTensor = make(map[uintptr]string)
	c.allocCount = 0
}

// ClearAllCaches frees all GPU weight caches.
func ClearAllCaches() {
	weightCacheMu.Lock()
	defer weightCacheMu.Unlock()
	for _, cache := range weightCaches {
		cache.Clear()
	}
	weightCaches = make(map[int]*GPUWeightCache)
}

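// shutdownWeightCaches is an illustrative teardown sketch, not called anywhere
// in this package: it emits the optional allocation summary while the caches
// are still populated, then releases everything. Whether and when to do this
// is up to the embedding application.
func shutdownWeightCaches() {
	// Summary logging is a no-op unless MAKARNA_GPU_MEMLOG enables it.
	LogWeightCacheSummary()
	ClearAllCaches()
}
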
// LogWeightCacheSummary prints per-GPU and per-layer allocation summaries when enabled.
func LogWeightCacheSummary() {
	if !weightMemLogSummaryEnabled() {
		return
	}
	weightCacheMu.Lock()
	caches := make([]*GPUWeightCache, 0, len(weightCaches))
	for _, cache := range weightCaches {
		caches = append(caches, cache)
	}
	weightCacheMu.Unlock()
	for _, cache := range caches {
		cache.dumpSummary()
	}
}

func (c *GPUWeightCache) recordAlloc(key string, sizeBytes int, dtype tensor.DType, shape tensor.Shape, t *cpu.Tensor) {
	if sizeBytes <= 0 {
		return
	}
	layer, ok := layerFromCacheKey(key)
	if !ok {
		layer = -1
	}
	if c.allocByLayer == nil {
		c.allocByLayer = make(map[int]uint64)
	}
	c.allocByLayer[layer] += uint64(sizeBytes)
	c.allocCount++
	if weightMemLogAllocEnabled() {
		log.Printf("gpu-cache alloc gpu=%d layer=%d bytes=%d total=%s key=%s dtype=%s shape=%v",
			c.gpu, layer, sizeBytes, formatBytes(c.totalMem), key, dtype.String(), shape)
		if t != nil {
			if c.dupByTensor == nil {
				c.dupByTensor = make(map[uintptr]string)
			}
			tID := uintptr(unsafe.Pointer(t))
			if prev, ok := c.dupByTensor[tID]; ok && prev != key {
				log.Printf("gpu-cache dup gpu=%d tensor=%p prev_key=%s new_key=%s", c.gpu, t, prev, key)
			} else if !ok {
				c.dupByTensor[tID] = key
			}
		}
	}
}

func (c *GPUWeightCache) dumpSummary() {
	if c == nil {
		return
	}
	c.mu.RLock()
	total := c.totalMem
	allocCount := c.allocCount
	byLayer := make([]layerAlloc, 0, len(c.allocByLayer))
	for layer, bytes := range c.allocByLayer {
		byLayer = append(byLayer, layerAlloc{layer: layer, bytes: bytes})
	}
	c.mu.RUnlock()
	sort.Slice(byLayer, func(i, j int) bool { return byLayer[i].layer < byLayer[j].layer })
	totalMem, freeMem, err := cuda.MemoryInfoDevice(c.gpu)
	if err != nil {
		log.Printf("gpu-cache summary gpu=%d total=%s allocs=%d", c.gpu, formatBytes(total), allocCount)
	} else {
		log.Printf("gpu-cache summary gpu=%d total=%s allocs=%d free=%s/%s",
			c.gpu, formatBytes(total), allocCount, formatBytes(freeMem), formatBytes(totalMem))
	}
	for _, entry := range byLayer {
		label := "shared"
		if entry.layer >= 0 {
			label = fmt.Sprintf("layer%d", entry.layer)
		}
		log.Printf("gpu-cache layer=%s bytes=%s", label, formatBytes(entry.bytes))
	}
}

type layerAlloc struct {
	layer int
	bytes uint64
}

var (
	weightMemLogOnce    sync.Once
	weightMemLogAlloc   bool
	weightMemLogSummary bool
)

func weightMemLogAllocEnabled() bool {
	weightMemLogInit()
	return weightMemLogAlloc
}

func weightMemLogSummaryEnabled() bool {
	weightMemLogInit()
	return weightMemLogSummary
}

func weightMemLogInit() {
	weightMemLogOnce.Do(func() {
		raw := strings.ToLower(strings.TrimSpace(os.Getenv("MAKARNA_GPU_MEMLOG")))
		if raw == "" || raw == "0" || raw == "false" || raw == "off" {
			return
		}
		switch raw {
		case "1", "true", "all":
			weightMemLogAlloc = true
			weightMemLogSummary = true
			return
		}
		if strings.Contains(raw, "alloc") {
			weightMemLogAlloc = true
		}
		if strings.Contains(raw, "summary") {
			weightMemLogSummary = true
		}
		if !weightMemLogAlloc && !weightMemLogSummary {
			weightMemLogAlloc = true
		}
	})
}

func layerFromCacheKey(key string) (int, bool) {
	if strings.HasPrefix(key, "layer") {
		rest := key[len("layer"):]
		n := readLeadingInt(rest)
		if n >= 0 {
			return n, true
		}
	}
	if strings.HasPrefix(key, "kda_l") {
		rest := key[len("kda_l"):]
		n := readLeadingInt(rest)
		if n >= 0 {
			return n, true
		}
	}
	return 0, false
}

func readLeadingInt(s string) int {
	if s == "" {
		return -1
	}
	end := 0
	for end < len(s) && s[end] >= '0' && s[end] <= '9' {
		end++
	}
	if end == 0 {
		return -1
	}
	n, err := strconv.Atoi(s[:end])
	if err != nil {
		return -1
	}
	return n
}

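// layerKeyExamples is an illustrative sketch of how layerFromCacheKey groups
// allocations for the per-layer summary. The concrete key strings below are
// assumptions for the example, not keys this package requires.
func layerKeyExamples() map[string]int {
	out := make(map[string]int)
	for _, key := range []string{
		"layer12.attn_q",   // "layer" prefix -> grouped under layer 12
		"kda_l3.conv",      // "kda_l" prefix -> grouped under layer 3
		"token_embeddings", // no recognized prefix -> reported as "shared" (-1)
	} {
		if layer, ok := layerFromCacheKey(key); ok {
			out[key] = layer
		} else {
			out[key] = -1
		}
	}
	return out
}
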
func formatBytes(v uint64) string {
	const unit = 1024
	if v < unit {
		return fmt.Sprintf("%dB", v)
	}
	div, exp := uint64(unit), 0
	for n := v / unit; n >= unit && exp < 4; n /= unit {
		div *= unit
		exp++
	}
	value := float64(v) / float64(div)
	suffix := []string{"KB", "MB", "GB", "TB", "PB"}[exp]
	return fmt.Sprintf("%.2f%s", value, suffix)
}
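
// formatBytesExamples is an illustrative sketch of formatBytes output for a
// few sizes; the values are arbitrary examples, not real allocation figures.
func formatBytesExamples() []string {
	return []string{
		formatBytes(512),                    // "512B"
		formatBytes(1536),                   // "1.50KB"
		formatBytes(3 * 1024 * 1024 * 1024), // "3.00GB"
	}
}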