- //go:build cuda
- // Package compute provides GPU weight caching so model weights persist on-device across calls.
- package compute
- import (
- "fmt"
- "log"
- "os"
- "sort"
- "strconv"
- "strings"
- "sync"
- "unsafe"
- "makarna/pkg/backend/cpu"
- "makarna/pkg/backend/cuda"
- "makarna/pkg/tensor"
- )
- // GPUWeightCache stores quantized and float weights on the GPU for reuse across calls.
- // This is the key optimization: upload once, use many times.
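- //
- // Typical call pattern (a minimal sketch; the cache key and the *cpu.Tensor
- // variable w are illustrative, not names defined in this package):
- //
- //	cache := GetWeightCache(0) // device 0
- //	key := "layer12.ffn_up"
- //	ptr, ok := cache.Get(key)
- //	if !ok {
- //		var err error
- //		if ptr, err = cache.Upload(key, w); err != nil {
- //			return err
- //		}
- //	}
- //	// ptr is a device pointer that stays valid until the cache is cleared.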
- type GPUWeightCache struct {
- mu sync.RWMutex
- weights map[string]cachedWeight
- gpu int
- totalMem uint64 // bytes allocated
- allocByLayer map[int]uint64
- dupByTensor map[uintptr]string
- allocCount uint64
- }
- type cachedWeight struct {
- ptr unsafe.Pointer
- tensor *cuda.Tensor // Keep reference for float weights to prevent GC
- dtype tensor.DType
- shape tensor.Shape
- numBlocks int
- sizeBytes int
- }
- // Global weight cache per GPU
- var (
- weightCaches = make(map[int]*GPUWeightCache)
- weightCacheMu sync.Mutex
- )
- // GetWeightCache returns the weight cache for a GPU, creating if needed.
- func GetWeightCache(gpu int) *GPUWeightCache {
- weightCacheMu.Lock()
- defer weightCacheMu.Unlock()
- if cache, ok := weightCaches[gpu]; ok {
- return cache
- }
- cache := &GPUWeightCache{
- weights: make(map[string]cachedWeight),
- gpu: gpu,
- allocByLayer: make(map[int]uint64),
- dupByTensor: make(map[uintptr]string),
- }
- weightCaches[gpu] = cache
- return cache
- }
- // Get returns a cached GPU weight pointer, or (nil, false) if the key is not cached.
- func (c *GPUWeightCache) Get(key string) (unsafe.Pointer, bool) {
- c.mu.RLock()
- defer c.mu.RUnlock()
- if w, ok := c.weights[key]; ok {
- return w.ptr, true
- }
- return nil, false
- }
- // GetTensor returns the cached CUDA tensor for float weights (Float16/BFloat16/Float32 on GPU).
- // For quantized weights, this returns (nil, false) since they are stored as raw device pointers.
- func (c *GPUWeightCache) GetTensor(key string) (*cuda.Tensor, bool) {
- c.mu.RLock()
- defer c.mu.RUnlock()
- w, ok := c.weights[key]
- if !ok {
- return nil, false
- }
- if w.tensor == nil {
- return nil, false
- }
- return w.tensor, true
- }
- // Upload uploads a CPU tensor to GPU and caches it.
- // Returns the GPU pointer for immediate use.
- func (c *GPUWeightCache) Upload(key string, t *cpu.Tensor) (unsafe.Pointer, error) {
- c.mu.Lock()
- defer c.mu.Unlock()
- // Check if already cached
- if w, ok := c.weights[key]; ok {
- return w.ptr, nil
- }
- shape := t.Shape()
- dtype := t.DType()
- numElements := shape.NumElements()
- var ptr unsafe.Pointer
- var sizeBytes int
- var numBlocks int
- var err error
- switch dtype {
- case tensor.Float16, tensor.BFloat16:
- sizeBytes = numElements * 2
- gpuTensor, err2 := cuda.NewTensor(shape, dtype, c.gpu)
- if err2 != nil {
- return nil, fmt.Errorf("alloc %v weight: %w", dtype, err2)
- }
- if numElements > 0 {
- srcPtr := t.Data().(unsafe.Pointer)
- dstPtr := gpuTensor.Data().(unsafe.Pointer)
- if err2 := cuda.MemcpyH2D(dstPtr, srcPtr, uintptr(sizeBytes), c.gpu); err2 != nil {
- gpuTensor.Free()
- return nil, fmt.Errorf("copy %v weight: %w", dtype, err2)
- }
- }
- ptr = gpuTensor.Data().(unsafe.Pointer)
- // Store tensor reference to prevent GC.
- c.weights[key] = cachedWeight{
- ptr: ptr,
- tensor: gpuTensor,
- dtype: dtype,
- shape: shape,
- numBlocks: 0,
- sizeBytes: sizeBytes,
- }
- c.totalMem += uint64(sizeBytes)
- c.recordAlloc(key, sizeBytes, dtype, shape, t)
- return ptr, nil
- case tensor.Q8_K:
- numBlocks = numElements / 256
- sizeBytes = numBlocks * 292 // 4 (d) + 256 (qs) + 32 (bsums)
- data := t.Data().(unsafe.Pointer)
- dataSlice := unsafe.Slice((*byte)(data), sizeBytes)
- ptr, err = cuda.UploadQ8K(dataSlice, numBlocks, c.gpu)
- case tensor.Q5_K:
- numBlocks = numElements / 256
- sizeBytes = numBlocks * 176 // 2 (d) + 2 (dmin) + 12 (scales) + 32 (qh) + 128 (qs)
- data := t.Data().(unsafe.Pointer)
- dataSlice := unsafe.Slice((*byte)(data), sizeBytes)
- ptr, err = cuda.UploadQ5K(dataSlice, numBlocks, c.gpu)
- case tensor.Q4_K:
- numBlocks = numElements / 256
- sizeBytes = numBlocks * 144 // 2 (d) + 2 (dmin) + 12 (scales) + 128 (qs)
- data := t.Data().(unsafe.Pointer)
- dataSlice := unsafe.Slice((*byte)(data), sizeBytes)
- ptr, err = cuda.UploadQ4K(dataSlice, numBlocks, c.gpu)
- case tensor.Q2_K:
- numBlocks = numElements / 256
- sizeBytes = numBlocks * 84 // 16 (scales) + 64 (qs) + 2 (d) + 2 (dmin)
- data := t.Data().(unsafe.Pointer)
- dataSlice := unsafe.Slice((*byte)(data), sizeBytes)
- ptr, err = cuda.UploadQ2K(dataSlice, numBlocks, c.gpu)
- case tensor.Q3_K:
- numBlocks = numElements / 256
- sizeBytes = numBlocks * 110 // 32(hm) + 64(qs) + 12(scales) + 2(d)
- data := t.Data().(unsafe.Pointer)
- dataSlice := unsafe.Slice((*byte)(data), sizeBytes)
- ptr, err = cuda.UploadQ3K(dataSlice, numBlocks, c.gpu)
- case tensor.Q6_K:
- numBlocks = numElements / 256
- sizeBytes = numBlocks * 210 // 128(ql) + 64(qh) + 16(scales) + 2(d)
- data := t.Data().(unsafe.Pointer)
- dataSlice := unsafe.Slice((*byte)(data), sizeBytes)
- ptr, err = cuda.UploadQ6K(dataSlice, numBlocks, c.gpu)
- case tensor.Float32:
- sizeBytes = numElements * 4
- gpuTensor, err2 := cuda.NewTensor(shape, tensor.Float32, c.gpu)
- if err2 != nil {
- return nil, fmt.Errorf("alloc F32 weight: %w", err2)
- }
- if err2 := gpuTensor.CopyFrom(t.DataFloat32()); err2 != nil {
- gpuTensor.Free()
- return nil, fmt.Errorf("copy F32 weight: %w", err2)
- }
- ptr = gpuTensor.Data().(unsafe.Pointer)
- // Store tensor reference to prevent GC.
- c.weights[key] = cachedWeight{
- ptr: ptr,
- tensor: gpuTensor,
- dtype: dtype,
- shape: shape,
- numBlocks: 0,
- sizeBytes: sizeBytes,
- }
- c.totalMem += uint64(sizeBytes)
- c.recordAlloc(key, sizeBytes, dtype, shape, t)
- return ptr, nil
- default:
- return nil, fmt.Errorf("unsupported dtype for GPU cache: %v", dtype)
- }
- if err != nil {
- return nil, err
- }
- c.weights[key] = cachedWeight{
- ptr: ptr,
- tensor: nil, // Quant weights store raw device pointers.
- dtype: dtype,
- shape: shape,
- numBlocks: numBlocks,
- sizeBytes: sizeBytes,
- }
- c.totalMem += uint64(sizeBytes)
- c.recordAlloc(key, sizeBytes, dtype, shape, t)
- return ptr, nil
- }
- // UploadF16 uploads a Float32 CPU tensor to GPU as Float16 and caches it.
- // Intended for Tensor Core GEMM paths (e.g., dense matmul weights).
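- // The Float32 data is staged in a temporary on-device buffer and converted with
- // CastF32ToF16, so no host-side F16 copy is needed. A minimal sketch (the key and
- // the *cpu.Tensor variable w are illustrative):
- //
- //	ptr, err := cache.UploadF16("layer3.attn.q_proj", w) // w must be Float32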
- func (c *GPUWeightCache) UploadF16(key string, t *cpu.Tensor) (unsafe.Pointer, error) {
- c.mu.Lock()
- defer c.mu.Unlock()
- // Check if already cached
- if w, ok := c.weights[key]; ok {
- return w.ptr, nil
- }
- if t.DType() != tensor.Float32 {
- return nil, fmt.Errorf("UploadF16: expected Float32 tensor, got %v", t.DType())
- }
- shape := t.Shape()
- numElements := shape.NumElements()
- sizeBytes := numElements * 2
- tmpF32, err := cuda.NewTensor(shape, tensor.Float32, c.gpu)
- if err != nil {
- return nil, fmt.Errorf("alloc temp F32 weight: %w", err)
- }
- if err := tmpF32.CopyFrom(t.DataFloat32()); err != nil {
- tmpF32.Free()
- return nil, fmt.Errorf("copy temp F32 weight: %w", err)
- }
- gpuTensor, err := cuda.NewTensor(shape, tensor.Float16, c.gpu)
- if err != nil {
- tmpF32.Free()
- return nil, fmt.Errorf("alloc F16 weight: %w", err)
- }
- if err := cuda.CastF32ToF16(tmpF32.Data().(unsafe.Pointer), gpuTensor.Data().(unsafe.Pointer), numElements, c.gpu); err != nil {
- tmpF32.Free()
- gpuTensor.Free()
- return nil, fmt.Errorf("cast weight F32->F16: %w", err)
- }
- tmpF32.Free()
- ptr := gpuTensor.Data().(unsafe.Pointer)
- c.weights[key] = cachedWeight{
- ptr: ptr,
- tensor: gpuTensor,
- dtype: tensor.Float16,
- shape: shape,
- numBlocks: 0,
- sizeBytes: sizeBytes,
- }
- c.totalMem += uint64(sizeBytes)
- c.recordAlloc(key, sizeBytes, tensor.Float16, shape, t)
- return ptr, nil
- }
- // TotalMemory returns the total GPU memory used by the cache, in bytes.
- func (c *GPUWeightCache) TotalMemory() uint64 {
- c.mu.RLock()
- defer c.mu.RUnlock()
- return c.totalMem
- }
- // Clear frees all cached weights.
- func (c *GPUWeightCache) Clear() {
- c.mu.Lock()
- defer c.mu.Unlock()
- for _, w := range c.weights {
- if w.tensor == nil {
- // Quantized weights are raw device pointers and must be freed explicitly.
- cuda.FreeDevicePtr(w.ptr)
- }
- // Float weights hold a *cuda.Tensor; resetting the map below drops the last
- // cache reference so its finalizer can release the GPU memory.
- }
- c.weights = make(map[string]cachedWeight)
- c.totalMem = 0
- c.allocByLayer = make(map[int]uint64)
- c.dupByTensor = make(map[uintptr]string)
- c.allocCount = 0
- }
- // ClearAllCaches frees all GPU weight caches.
- func ClearAllCaches() {
- weightCacheMu.Lock()
- defer weightCacheMu.Unlock()
- for _, cache := range weightCaches {
- cache.Clear()
- }
- weightCaches = make(map[int]*GPUWeightCache)
- }
- // LogWeightCacheSummary prints per-GPU and per-layer allocation summaries when enabled.
- func LogWeightCacheSummary() {
- if !weightMemLogSummaryEnabled() {
- return
- }
- weightCacheMu.Lock()
- caches := make([]*GPUWeightCache, 0, len(weightCaches))
- for _, cache := range weightCaches {
- caches = append(caches, cache)
- }
- weightCacheMu.Unlock()
- for _, cache := range caches {
- cache.dumpSummary()
- }
- }
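- // recordAlloc updates the per-layer accounting and optional duplicate-upload
- // tracking. Callers must already hold c.mu (Upload and UploadF16 do).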
- func (c *GPUWeightCache) recordAlloc(key string, sizeBytes int, dtype tensor.DType, shape tensor.Shape, t *cpu.Tensor) {
- if sizeBytes <= 0 {
- return
- }
- layer, ok := layerFromCacheKey(key)
- if !ok {
- layer = -1
- }
- if c.allocByLayer == nil {
- c.allocByLayer = make(map[int]uint64)
- }
- c.allocByLayer[layer] += uint64(sizeBytes)
- c.allocCount++
- if weightMemLogAllocEnabled() {
- log.Printf("gpu-cache alloc gpu=%d layer=%d bytes=%d total=%s key=%s dtype=%s shape=%v",
- c.gpu, layer, sizeBytes, formatBytes(c.totalMem), key, dtype.String(), shape)
- if t != nil {
- if c.dupByTensor == nil {
- c.dupByTensor = make(map[uintptr]string)
- }
- tID := uintptr(unsafe.Pointer(t))
- if prev, ok := c.dupByTensor[tID]; ok && prev != key {
- log.Printf("gpu-cache dup gpu=%d tensor=%p prev_key=%s new_key=%s", c.gpu, t, prev, key)
- } else if !ok {
- c.dupByTensor[tID] = key
- }
- }
- }
- }
- func (c *GPUWeightCache) dumpSummary() {
- if c == nil {
- return
- }
- c.mu.RLock()
- total := c.totalMem
- allocCount := c.allocCount
- byLayer := make([]layerAlloc, 0, len(c.allocByLayer))
- for layer, bytes := range c.allocByLayer {
- byLayer = append(byLayer, layerAlloc{layer: layer, bytes: bytes})
- }
- c.mu.RUnlock()
- sort.Slice(byLayer, func(i, j int) bool {
- return byLayer[i].layer < byLayer[j].layer
- })
- totalMem, freeMem, err := cuda.MemoryInfoDevice(c.gpu)
- if err != nil {
- log.Printf("gpu-cache summary gpu=%d total=%s allocs=%d", c.gpu, formatBytes(total), allocCount)
- } else {
- log.Printf("gpu-cache summary gpu=%d total=%s allocs=%d free=%s/%s", c.gpu, formatBytes(total), allocCount, formatBytes(freeMem), formatBytes(totalMem))
- }
- for _, entry := range byLayer {
- label := "shared"
- if entry.layer >= 0 {
- label = fmt.Sprintf("layer%d", entry.layer)
- }
- log.Printf("gpu-cache layer=%s bytes=%s", label, formatBytes(entry.bytes))
- }
- }
- type layerAlloc struct {
- layer int
- bytes uint64
- }
- var (
- weightMemLogOnce sync.Once
- weightMemLogAlloc bool
- weightMemLogSummary bool
- )
- func weightMemLogAllocEnabled() bool {
- weightMemLogInit()
- return weightMemLogAlloc
- }
- func weightMemLogSummaryEnabled() bool {
- weightMemLogInit()
- return weightMemLogSummary
- }
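- // weightMemLogInit parses MAKARNA_GPU_MEMLOG once. Empty, "0", "false", or "off"
- // disables logging; "1", "true", or "all" enables both alloc and summary logging;
- // otherwise any value containing "alloc" and/or "summary" enables those modes,
- // with alloc logging as the fallback (e.g. MAKARNA_GPU_MEMLOG=alloc,summary).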
- func weightMemLogInit() {
- weightMemLogOnce.Do(func() {
- raw := strings.ToLower(strings.TrimSpace(os.Getenv("MAKARNA_GPU_MEMLOG")))
- if raw == "" || raw == "0" || raw == "false" || raw == "off" {
- return
- }
- switch raw {
- case "1", "true", "all":
- weightMemLogAlloc = true
- weightMemLogSummary = true
- return
- }
- if strings.Contains(raw, "alloc") {
- weightMemLogAlloc = true
- }
- if strings.Contains(raw, "summary") {
- weightMemLogSummary = true
- }
- if !weightMemLogAlloc && !weightMemLogSummary {
- weightMemLogAlloc = true
- }
- })
- }
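- // layerFromCacheKey extracts the layer index from keys beginning with "layer<N>"
- // or "kda_l<N>" (e.g. an illustrative key such as "layer12.ffn_up"); it reports
- // ok=false when the key has no recognized layer prefix.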
- func layerFromCacheKey(key string) (int, bool) {
- if strings.HasPrefix(key, "layer") {
- rest := key[len("layer"):]
- n := readLeadingInt(rest)
- if n >= 0 {
- return n, true
- }
- }
- if strings.HasPrefix(key, "kda_l") {
- rest := key[len("kda_l"):]
- n := readLeadingInt(rest)
- if n >= 0 {
- return n, true
- }
- }
- return 0, false
- }
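- // readLeadingInt parses the run of leading ASCII digits in s and returns its
- // value, or -1 if s does not start with a digit.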
- func readLeadingInt(s string) int {
- if s == "" {
- return -1
- }
- end := 0
- for end < len(s) && s[end] >= '0' && s[end] <= '9' {
- end++
- }
- if end == 0 {
- return -1
- }
- n, err := strconv.Atoi(s[:end])
- if err != nil {
- return -1
- }
- return n
- }
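- // formatBytes renders a byte count with binary (1024-based) units,
- // e.g. formatBytes(1536) == "1.50KB".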
- func formatBytes(v uint64) string {
- const unit = 1024
- if v < unit {
- return fmt.Sprintf("%dB", v)
- }
- div, exp := uint64(unit), 0
- for n := v / unit; n >= unit && exp < 4; n /= unit {
- div *= unit
- exp++
- }
- value := float64(v) / float64(div)
- suffix := []string{"KB", "MB", "GB", "TB", "PB"}[exp]
- return fmt.Sprintf("%.2f%s", value, suffix)
- }