#include "cuda_common.cuh" #include #include #include #include #include // --- Memory --- namespace { struct free_block { void * ptr; size_t size; }; struct device_pool { std::mutex mu; std::unordered_map alloc_sizes; std::vector free_list; size_t cached_bytes = 0; }; // Current CUDA device cached per host thread. // This is updated by cuda_set_device and used by cuda_malloc/cuda_free. static thread_local int tls_device = 0; // Keep a small per-device cache of freed allocations to avoid cudaMalloc/cudaFree churn // and to keep VRAM usage stable after first-touch allocations. static device_pool g_pools[16]; static constexpr size_t MAX_FREE_BLOCKS_PER_DEVICE = 1024; static size_t g_pool_max_cached_bytes = 512ULL << 20; // 512MB static size_t g_pool_max_block_bytes = 64ULL << 20; // 64MB static bool g_pool_enabled = true; static std::once_flag g_pool_config_once; static size_t parse_env_bytes(const char * env, size_t def_val) { if (env == nullptr || env[0] == '\0') { return def_val; } char * end = nullptr; unsigned long long val = std::strtoull(env, &end, 10); if (end != nullptr && *end != '\0') { switch (*end) { case 'k': case 'K': val *= 1024ULL; break; case 'm': case 'M': val *= 1024ULL * 1024ULL; break; case 'g': case 'G': val *= 1024ULL * 1024ULL * 1024ULL; break; default: break; } } return static_cast(val); } static bool env_true(const char * env) { if (env == nullptr) { return false; } if (std::strcmp(env, "1") == 0 || std::strcmp(env, "true") == 0 || std::strcmp(env, "TRUE") == 0) { return true; } return false; } static void init_pool_config() { std::call_once(g_pool_config_once, []() { const char * disable = std::getenv("MAKARNA_CUDA_POOL_DISABLE"); if (env_true(disable)) { g_pool_enabled = false; g_pool_max_cached_bytes = 0; g_pool_max_block_bytes = 0; return; } const char * max_bytes = std::getenv("MAKARNA_CUDA_POOL_MAX_BYTES"); const char * max_block = std::getenv("MAKARNA_CUDA_POOL_MAX_BLOCK_BYTES"); g_pool_max_cached_bytes = parse_env_bytes(max_bytes, g_pool_max_cached_bytes); g_pool_max_block_bytes = parse_env_bytes(max_block, g_pool_max_block_bytes); }); } static device_pool & pool_for(int device) { if (device < 0) device = 0; if (device >= 16) device = device % 16; return g_pools[device]; } static void * pool_alloc(int device, size_t size) { init_pool_config(); device_pool & p = pool_for(device); std::lock_guard lock(p.mu); // Best-fit search: pick the smallest block that satisfies the request. 
    size_t best_i = (size_t) -1;
    size_t best_size = (size_t) -1;
    for (size_t i = 0; i < p.free_list.size(); ++i) {
        const free_block & b = p.free_list[i];
        if (b.size >= size && b.size < best_size) {
            best_i = i;
            best_size = b.size;
        }
    }
    if (best_i != (size_t) -1) {
        void * ptr = p.free_list[best_i].ptr;
        size_t bsize = p.free_list[best_i].size;
        // erase by swap-with-back
        p.free_list[best_i] = p.free_list.back();
        p.free_list.pop_back();
        if (p.cached_bytes >= bsize) {
            p.cached_bytes -= bsize;
        } else {
            p.cached_bytes = 0;
        }
        return ptr;
    }
    return nullptr;
}

static void pool_record_alloc(int device, void * ptr, size_t size) {
    if (ptr == nullptr) return;
    device_pool & p = pool_for(device);
    std::lock_guard<std::mutex> lock(p.mu);
    p.alloc_sizes[ptr] = size;
}

static size_t pool_lookup_size(int device, void * ptr) {
    device_pool & p = pool_for(device);
    std::lock_guard<std::mutex> lock(p.mu);
    auto it = p.alloc_sizes.find(ptr);
    if (it == p.alloc_sizes.end()) {
        return 0;
    }
    return it->second;
}

static int pool_find_device(void * ptr, size_t * out_size) {
    if (out_size) *out_size = 0;
    if (ptr == nullptr) return -1;
    for (int d = 0; d < 16; ++d) {
        device_pool & p = g_pools[d];
        std::lock_guard<std::mutex> lock(p.mu);
        auto it = p.alloc_sizes.find(ptr);
        if (it != p.alloc_sizes.end()) {
            if (out_size) *out_size = it->second;
            return d;
        }
    }
    return -1;
}

static void pool_free(int device, void * ptr) {
    init_pool_config();
    if (ptr == nullptr) return;
    size_t size = pool_lookup_size(device, ptr);
    int actual_device = device;
    if (size == 0) {
        int found = pool_find_device(ptr, &size);
        if (found >= 0) {
            actual_device = found;
        }
    }
    device_pool & p = pool_for(actual_device);
    std::lock_guard<std::mutex> lock(p.mu);
    if (!g_pool_enabled || g_pool_max_cached_bytes == 0 || g_pool_max_block_bytes == 0 ||
        size > g_pool_max_block_bytes) {
        cudaSetDevice(actual_device);
        cudaFree(ptr);
        p.alloc_sizes.erase(ptr);
        return;
    }
    if (p.free_list.size() >= MAX_FREE_BLOCKS_PER_DEVICE || p.cached_bytes + size > g_pool_max_cached_bytes) {
        // Pool full: actually free.
        cudaSetDevice(actual_device);
        cudaFree(ptr);
        p.alloc_sizes.erase(ptr);
        return;
    }
    p.free_list.push_back(free_block{ptr, size});
    p.cached_bytes += size;
}

} // namespace

int cuda_set_device(int id) {
    // cudaSetDevice is expensive when called repeatedly.
    // Cache per host thread since the CUDA device context is thread-affine.
    if (tls_device == id) {
        return 0;
    }
    CHECK_CUDA(cudaSetDevice(id));
    tls_device = id;
    return 0;
}

void* cuda_malloc(size_t size) {
    init_pool_config();
    const int device = tls_device;
    void * ptr = pool_alloc(device, size);
    if (ptr != nullptr) {
        return ptr;
    }
    ptr = nullptr;
    if (cudaMalloc(&ptr, size) != cudaSuccess) {
        return nullptr;
    }
    pool_record_alloc(device, ptr, size);
    return ptr;
}

void cuda_free(void* ptr) {
    const int device = tls_device;
    pool_free(device, ptr);
}

int cuda_synchronize() {
    CHECK_CUDA(cudaDeviceSynchronize());
    return 0;
}

int cuda_memcpy_h2d(void* dst, void* src, size_t size) {
    CHECK_CUDA(cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice));
    return 0;
}

int cuda_memcpy_d2h(void* dst, void* src, size_t size) {
    CHECK_CUDA(cudaMemcpy(dst, src, size, cudaMemcpyDeviceToHost));
    return 0;
}

int cuda_memcpy_d2d(void* dst, void* src, size_t size) {
    CHECK_CUDA(cudaMemcpy(dst, src, size, cudaMemcpyDeviceToDevice));
    return 0;
}

int cuda_mem_info(size_t* free_bytes, size_t* total_bytes) {
    // cudaMemGetInfo can return cudaErrorOperatingSystem in some restricted
    // environments even though allocations/kernels work. Fall back to a
    // probing allocation so higher-level placement logic can still function.
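    // The fallback doubles a probe allocation until cudaMalloc fails, then
    // binary-searches between the last success and the first failure down to
    // 64MB granularity. The result is a conservative lower bound on free VRAM
    // and is also reported as "total", since the real total is unknown here.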
    cudaError_t err = cudaMemGetInfo(free_bytes, total_bytes);
    if (err == cudaSuccess) {
        return 0;
    }
    if (err == cudaErrorOperatingSystem) {
        // Some sandboxes block driver queries (MemGetInfo/GetDeviceProperties)
        // but still allow allocations. Approximate "free" with a probing alloc.
        (void) cudaGetLastError();
        size_t max_ok = 0;
        size_t probe = 256ULL << 20;          // 256MB
        const size_t max_probe = 64ULL << 30; // 64GB cap
        void * p = nullptr;
        while (probe <= max_probe) {
            cudaError_t e = cudaMalloc(&p, probe);
            if (e == cudaSuccess) {
                (void) cudaFree(p);
                p = nullptr;
                max_ok = probe;
                probe <<= 1;
                continue;
            }
            (void) cudaGetLastError();
            break;
        }
        size_t lo = max_ok;
        size_t hi = probe;
        // Binary search to 64MB granularity.
        const size_t gran = 64ULL << 20;
        while (hi > lo + gran) {
            size_t mid = lo + (hi - lo) / 2;
            mid = (mid / (1ULL << 20)) * (1ULL << 20); // align to 1MB
            if (mid <= lo) {
                break;
            }
            cudaError_t e = cudaMalloc(&p, mid);
            if (e == cudaSuccess) {
                (void) cudaFree(p);
                p = nullptr;
                lo = mid;
            } else {
                (void) cudaGetLastError();
                hi = mid;
            }
        }
        if (free_bytes) {
            *free_bytes = lo;
        }
        if (total_bytes) {
            *total_bytes = lo;
        }
        return 0;
    }
    fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(err));
    return 1;
}

int cuda_device_count(int* count) {
    int c = 0;
    CHECK_CUDA(cudaGetDeviceCount(&c));
    if (count) {
        *count = c;
    }
    return 0;
}
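
// ---------------------------------------------------------------------------
// Hypothetical usage sketch (kept out of the build with #if 0): shows how a
// caller is expected to combine cuda_set_device/cuda_malloc/cuda_free and how,
// with the default pool configuration, re-allocating the same size is served
// from the per-device cache instead of a second cudaMalloc. The function name
// and buffer size below are illustrative, not part of the API.
#if 0
static int example_roundtrip() {
    if (cuda_set_device(0) != 0) {
        return 1;
    }
    const size_t bytes = 16ULL << 20;   // 16MB scratch buffer (arbitrary size)
    void * a = cuda_malloc(bytes);
    if (a == nullptr) {
        return 1;
    }
    cuda_free(a);                       // block is cached in the device pool
    void * b = cuda_malloc(bytes);      // best-fit hit: reuses the cached block
    cuda_free(b);

    size_t free_b = 0, total_b = 0;
    if (cuda_mem_info(&free_b, &total_b) == 0) {
        fprintf(stderr, "free=%zu total=%zu\n", free_b, total_b);
    }
    return 0;
}
#endif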