- #include "cuda_common.cuh"
- #include <cstdlib>
- #include <cstring>
- #include <mutex>
- #include <unordered_map>
- #include <vector>

// --- Memory ---

namespace {

struct free_block {
    void * ptr;
    size_t size;
};

struct device_pool {
    std::mutex mu;
    std::unordered_map<void *, size_t> alloc_sizes;
    std::vector<free_block> free_list;
    size_t cached_bytes = 0;
};

// Current CUDA device cached per host thread.
// This is updated by cuda_set_device and used by cuda_malloc/cuda_free.
static thread_local int tls_device = 0;

// Keep a small per-device cache of freed allocations to avoid cudaMalloc/cudaFree
// churn and to keep VRAM usage stable after first-touch allocations.
static constexpr int MAX_DEVICES = 16;
static device_pool g_pools[MAX_DEVICES];
static constexpr size_t MAX_FREE_BLOCKS_PER_DEVICE = 1024;

static size_t g_pool_max_cached_bytes = 512ULL << 20; // 512 MiB
static size_t g_pool_max_block_bytes = 64ULL << 20;   // 64 MiB
static bool g_pool_enabled = true;
static std::once_flag g_pool_config_once;
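
// Illustrative parses for parse_env_bytes below (example values, not defaults
// from any real deployment):
//   "1048576" -> 1048576 bytes      "512M" -> 536870912
//   "2G"      -> 2147483648         ""     -> def_val (fallback)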
static size_t parse_env_bytes(const char * env, size_t def_val) {
    if (env == nullptr || env[0] == '\0') {
        return def_val;
    }
    char * end = nullptr;
    unsigned long long val = std::strtoull(env, &end, 10);
    if (end != nullptr && *end != '\0') {
        switch (*end) {
            case 'k':
            case 'K':
                val *= 1024ULL;
                break;
            case 'm':
            case 'M':
                val *= 1024ULL * 1024ULL;
                break;
            case 'g':
            case 'G':
                val *= 1024ULL * 1024ULL * 1024ULL;
                break;
            default:
                break;
        }
    }
    return static_cast<size_t>(val);
}

static bool env_true(const char * env) {
    if (env == nullptr) {
        return false;
    }
    return std::strcmp(env, "1") == 0 || std::strcmp(env, "true") == 0 || std::strcmp(env, "TRUE") == 0;
}

static void init_pool_config() {
    std::call_once(g_pool_config_once, []() {
        const char * disable = std::getenv("MAKARNA_CUDA_POOL_DISABLE");
        if (env_true(disable)) {
            g_pool_enabled = false;
            g_pool_max_cached_bytes = 0;
            g_pool_max_block_bytes = 0;
            return;
        }
        const char * max_bytes = std::getenv("MAKARNA_CUDA_POOL_MAX_BYTES");
        const char * max_block = std::getenv("MAKARNA_CUDA_POOL_MAX_BLOCK_BYTES");
        g_pool_max_cached_bytes = parse_env_bytes(max_bytes, g_pool_max_cached_bytes);
        g_pool_max_block_bytes = parse_env_bytes(max_block, g_pool_max_block_bytes);
    });
}
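
// Example configuration via the environment (illustrative values; `my_app`
// stands in for any binary that links this file):
//
//   MAKARNA_CUDA_POOL_MAX_BYTES=1G \
//   MAKARNA_CUDA_POOL_MAX_BLOCK_BYTES=128M \
//   ./my_app
//
// Caching can be disabled entirely with MAKARNA_CUDA_POOL_DISABLE=1.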

static device_pool & pool_for(int device) {
    if (device < 0) device = 0;
    if (device >= MAX_DEVICES) device = device % MAX_DEVICES;
    return g_pools[device];
}

static void * pool_alloc(int device, size_t size) {
    init_pool_config();
    device_pool & p = pool_for(device);
    std::lock_guard<std::mutex> lock(p.mu);

    // Best-fit search: pick the smallest block that satisfies the request.
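    // For example (illustrative sizes): with a free list of {32 MiB, 8 MiB,
    // 16 MiB}, a 10 MiB request returns the 16 MiB block, the smallest cached
    // block that still fits the request.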
    size_t best_i = (size_t) -1;
    size_t best_size = (size_t) -1;
    for (size_t i = 0; i < p.free_list.size(); ++i) {
        const free_block & b = p.free_list[i];
        if (b.size >= size && b.size < best_size) {
            best_i = i;
            best_size = b.size;
        }
    }
    if (best_i != (size_t) -1) {
        void * ptr = p.free_list[best_i].ptr;
        size_t bsize = p.free_list[best_i].size;
        // erase by swap-with-back
        p.free_list[best_i] = p.free_list.back();
        p.free_list.pop_back();
        if (p.cached_bytes >= bsize) {
            p.cached_bytes -= bsize;
        } else {
            p.cached_bytes = 0;
        }
        return ptr;
    }

    return nullptr;
}

static void pool_record_alloc(int device, void * ptr, size_t size) {
    if (ptr == nullptr) return;
    device_pool & p = pool_for(device);
    std::lock_guard<std::mutex> lock(p.mu);
    p.alloc_sizes[ptr] = size;
}

static size_t pool_lookup_size(int device, void * ptr) {
    device_pool & p = pool_for(device);
    std::lock_guard<std::mutex> lock(p.mu);
    auto it = p.alloc_sizes.find(ptr);
    if (it == p.alloc_sizes.end()) {
        return 0;
    }
    return it->second;
}

// Slow path: scan every pool for a pointer that was not recorded on the
// caller's current device (e.g. freed from a different host thread).
static int pool_find_device(void * ptr, size_t * out_size) {
    if (out_size) *out_size = 0;
    if (ptr == nullptr) return -1;
    for (int d = 0; d < MAX_DEVICES; ++d) {
        device_pool & p = g_pools[d];
        std::lock_guard<std::mutex> lock(p.mu);
        auto it = p.alloc_sizes.find(ptr);
        if (it != p.alloc_sizes.end()) {
            if (out_size) *out_size = it->second;
            return d;
        }
    }
    return -1;
}

static void pool_free(int device, void * ptr) {
    init_pool_config();
    if (ptr == nullptr) return;
    size_t size = pool_lookup_size(device, ptr);
    int actual_device = device;
    if (size == 0) {
        // Not recorded on this device; it may have been allocated while the
        // calling thread was on a different device.
        int found = pool_find_device(ptr, &size);
        if (found >= 0) {
            actual_device = found;
        }
    }
    device_pool & p = pool_for(actual_device);
    // Free on the owning device, then restore the calling thread's device so
    // the tls_device cache in cuda_set_device stays truthful.
    auto release = [&]() {
        if (actual_device != tls_device) {
            cudaSetDevice(actual_device);
        }
        cudaFree(ptr);
        if (actual_device != tls_device) {
            cudaSetDevice(tls_device);
        }
        p.alloc_sizes.erase(ptr);
    };
    std::lock_guard<std::mutex> lock(p.mu);
    if (size == 0 || !g_pool_enabled || g_pool_max_cached_bytes == 0 || g_pool_max_block_bytes == 0 ||
        size > g_pool_max_block_bytes) {
        // Unknown size or caching disabled: actually free.
        release();
        return;
    }
    if (p.free_list.size() >= MAX_FREE_BLOCKS_PER_DEVICE || p.cached_bytes + size > g_pool_max_cached_bytes) {
        // Pool full: actually free.
        release();
        return;
    }
    p.free_list.push_back(free_block{ptr, size});
    p.cached_bytes += size;
}

} // namespace

int cuda_set_device(int id) {
    // cudaSetDevice is expensive when called repeatedly.
    // Cache per host thread since the CUDA device context is thread-affine.
    if (tls_device == id) {
        return 0;
    }
    CHECK_CUDA(cudaSetDevice(id));
    tls_device = id;
    return 0;
}

void* cuda_malloc(size_t size) {
    init_pool_config();
    const int device = tls_device;
    void * ptr = pool_alloc(device, size);
    if (ptr != nullptr) {
        return ptr;
    }
    ptr = nullptr;
    if (cudaMalloc(&ptr, size) != cudaSuccess) {
        // Clear the sticky error so subsequent CUDA calls are not poisoned.
        (void)cudaGetLastError();
        return nullptr;
    }
    pool_record_alloc(device, ptr, size);
    return ptr;
}

void cuda_free(void* ptr) {
    const int device = tls_device;
    pool_free(device, ptr);
}
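
// A minimal round-trip through the public API above (illustrative; `host`
// and the 1 MiB size are arbitrary stand-ins):
//
//   cuda_set_device(0);
//   void * dev = cuda_malloc(1 << 20);
//   if (dev != nullptr) {
//       cuda_memcpy_h2d(dev, host, 1 << 20);
//       cuda_free(dev); // cached in the per-device pool, not released
//   }
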
int cuda_synchronize() {
    CHECK_CUDA(cudaDeviceSynchronize());
    return 0;
}

int cuda_memcpy_h2d(void* dst, void* src, size_t size) {
    CHECK_CUDA(cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice));
    return 0;
}

int cuda_memcpy_d2h(void* dst, void* src, size_t size) {
    CHECK_CUDA(cudaMemcpy(dst, src, size, cudaMemcpyDeviceToHost));
    return 0;
}

int cuda_memcpy_d2d(void* dst, void* src, size_t size) {
    CHECK_CUDA(cudaMemcpy(dst, src, size, cudaMemcpyDeviceToDevice));
    return 0;
}

int cuda_mem_info(size_t* free_bytes, size_t* total_bytes) {
    // cudaMemGetInfo can return cudaErrorOperatingSystem in some restricted
    // environments even though allocations/kernels work. Fall back to a
    // probing allocation so higher-level placement logic can still function.
    cudaError_t err = cudaMemGetInfo(free_bytes, total_bytes);
    if (err == cudaSuccess) {
        return 0;
    }
    if (err == cudaErrorOperatingSystem) {
        // Some sandboxes block driver queries (MemGetInfo/GetDeviceProperties)
        // but still allow allocations. Approximate "free" with a probing alloc.
        (void)cudaGetLastError();
        size_t max_ok = 0;
        size_t probe = 256ULL << 20;          // start at 256 MiB
        const size_t max_probe = 64ULL << 30; // 64 GiB cap
        void* p = nullptr;
        // Double the probe until an allocation fails or the cap is reached,
        // bracketing the usable range as [max_ok, probe).
        while (probe <= max_probe) {
            cudaError_t e = cudaMalloc(&p, probe);
            if (e == cudaSuccess) {
                (void)cudaFree(p);
                p = nullptr;
                max_ok = probe;
                probe <<= 1;
                continue;
            }
            (void)cudaGetLastError();
            break;
        }
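        // Worked example (illustrative numbers): if 8 GiB succeeds but
        // 16 GiB fails, the loop above leaves max_ok = 8 GiB and
        // probe = 16 GiB; the bisection below then narrows that 8 GiB gap
        // to 64 MiB granularity in about log2(8 GiB / 64 MiB) = 7 probes.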
        size_t lo = max_ok;
        // Do not bisect past the probing cap: if the doubling loop exhausted
        // max_probe without a failure, lo == hi and the search is skipped.
        size_t hi = probe <= max_probe ? probe : max_probe;
        // Binary search down to 64 MiB granularity.
        const size_t gran = 64ULL << 20;
        while (hi > lo + gran) {
            size_t mid = lo + (hi - lo) / 2;
            mid = (mid / (1ULL << 20)) * (1ULL << 20); // align down to 1 MiB
            if (mid <= lo) {
                break;
            }
            cudaError_t e = cudaMalloc(&p, mid);
            if (e == cudaSuccess) {
                (void)cudaFree(p);
                p = nullptr;
                lo = mid;
            } else {
                (void)cudaGetLastError();
                hi = mid;
            }
        }
        if (free_bytes) {
            *free_bytes = lo;
        }
        if (total_bytes) {
            // Best effort: without driver queries the true capacity is
            // unknown, so report the probed free amount as the total too.
            *total_bytes = lo;
        }
        return 0;
    }
    fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(err));
    return 1;
}

int cuda_device_count(int* count) {
    int c = 0;
    CHECK_CUDA(cudaGetDeviceCount(&c));
    if (count) {
        *count = c;
    }
    return 0;
}