- #include "cuda_common.cuh"
- #include <cstdlib>
- #include <cstring>
- #include <mutex>
- #include <unordered_map>
- #include <vector>

// --- Memory ---

namespace {

struct free_block {
    void * ptr;
    size_t size;
};

struct device_pool {
    std::mutex mu;
    std::unordered_map<void *, size_t> alloc_sizes;
    std::vector<free_block> free_list;
    size_t cached_bytes = 0;
};

// Current CUDA device cached per host thread.
// This is updated by cuda_set_device and used by cuda_malloc/cuda_free.
static thread_local int tls_device = 0;

// Keep a small per-device cache of freed allocations to avoid cudaMalloc/cudaFree
// churn and to keep VRAM usage stable after first-touch allocations.
static constexpr int MAX_DEVICES = 16;
static device_pool g_pools[MAX_DEVICES];
static constexpr size_t MAX_FREE_BLOCKS_PER_DEVICE = 1024;

static size_t g_pool_max_cached_bytes = 512ULL << 20; // 512 MiB
static size_t g_pool_max_block_bytes = 64ULL << 20;   // 64 MiB
static bool g_pool_enabled = true;
static std::once_flag g_pool_config_once;
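
// Illustrative parses for parse_env_bytes below (example values, not defaults
// from any real deployment):
//   "1048576" -> 1048576 bytes      "512M" -> 536870912
//   "2G"      -> 2147483648         ""     -> def_val (fallback)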
static size_t parse_env_bytes(const char * env, size_t def_val) {
    if (env == nullptr || env[0] == '\0') {
        return def_val;
    }
    char * end = nullptr;
    unsigned long long val = std::strtoull(env, &end, 10);
    if (end != nullptr && *end != '\0') {
        switch (*end) {
            case 'k':
            case 'K':
                val *= 1024ULL;
                break;
            case 'm':
            case 'M':
                val *= 1024ULL * 1024ULL;
                break;
            case 'g':
            case 'G':
                val *= 1024ULL * 1024ULL * 1024ULL;
                break;
            default:
                break;
        }
    }
    return static_cast<size_t>(val);
}

static bool env_true(const char * env) {
    if (env == nullptr) {
        return false;
    }
    return std::strcmp(env, "1") == 0 || std::strcmp(env, "true") == 0 || std::strcmp(env, "TRUE") == 0;
}

static void init_pool_config() {
    std::call_once(g_pool_config_once, []() {
        const char * disable = std::getenv("MAKARNA_CUDA_POOL_DISABLE");
        if (env_true(disable)) {
            g_pool_enabled = false;
            g_pool_max_cached_bytes = 0;
            g_pool_max_block_bytes = 0;
            return;
        }
        const char * max_bytes = std::getenv("MAKARNA_CUDA_POOL_MAX_BYTES");
        const char * max_block = std::getenv("MAKARNA_CUDA_POOL_MAX_BLOCK_BYTES");
        g_pool_max_cached_bytes = parse_env_bytes(max_bytes, g_pool_max_cached_bytes);
        g_pool_max_block_bytes = parse_env_bytes(max_block, g_pool_max_block_bytes);
    });
}
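
// Example configuration via the environment (illustrative values; `my_app`
// stands in for any binary that links this file):
//
//   MAKARNA_CUDA_POOL_MAX_BYTES=1G \
//   MAKARNA_CUDA_POOL_MAX_BLOCK_BYTES=128M \
//   ./my_app
//
// Caching can be disabled entirely with MAKARNA_CUDA_POOL_DISABLE=1.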

static device_pool & pool_for(int device) {
    if (device < 0) device = 0;
    if (device >= MAX_DEVICES) device = device % MAX_DEVICES;
    return g_pools[device];
}

static void * pool_alloc(int device, size_t size) {
    init_pool_config();
    device_pool & p = pool_for(device);
    std::lock_guard<std::mutex> lock(p.mu);

    // Best-fit search: pick the smallest block that satisfies the request.
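    // For example (illustrative sizes): with a free list of {32 MiB, 8 MiB,
    // 16 MiB}, a 10 MiB request returns the 16 MiB block, the smallest cached
    // block that still fits the request.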
    size_t best_i = (size_t) -1;
    size_t best_size = (size_t) -1;
    for (size_t i = 0; i < p.free_list.size(); ++i) {
        const free_block & b = p.free_list[i];
        if (b.size >= size && b.size < best_size) {
            best_i = i;
            best_size = b.size;
        }
    }
    if (best_i != (size_t) -1) {
        void * ptr = p.free_list[best_i].ptr;
        size_t bsize = p.free_list[best_i].size;
        // erase by swap-with-back
        p.free_list[best_i] = p.free_list.back();
        p.free_list.pop_back();
        if (p.cached_bytes >= bsize) {
            p.cached_bytes -= bsize;
        } else {
            p.cached_bytes = 0;
        }
        return ptr;
    }

    return nullptr;
}

static void pool_record_alloc(int device, void * ptr, size_t size) {
    if (ptr == nullptr) return;
    device_pool & p = pool_for(device);
    std::lock_guard<std::mutex> lock(p.mu);
    p.alloc_sizes[ptr] = size;
}

static size_t pool_lookup_size(int device, void * ptr) {
    device_pool & p = pool_for(device);
    std::lock_guard<std::mutex> lock(p.mu);
    auto it = p.alloc_sizes.find(ptr);
    if (it == p.alloc_sizes.end()) {
        return 0;
    }
    return it->second;
}

// Slow path: scan every pool for a pointer that was not recorded on the
// caller's current device (e.g. freed from a different host thread).
static int pool_find_device(void * ptr, size_t * out_size) {
    if (out_size) *out_size = 0;
    if (ptr == nullptr) return -1;
    for (int d = 0; d < MAX_DEVICES; ++d) {
        device_pool & p = g_pools[d];
        std::lock_guard<std::mutex> lock(p.mu);
        auto it = p.alloc_sizes.find(ptr);
        if (it != p.alloc_sizes.end()) {
            if (out_size) *out_size = it->second;
            return d;
        }
    }
    return -1;
}

static void pool_free(int device, void * ptr) {
    init_pool_config();
    if (ptr == nullptr) return;
    size_t size = pool_lookup_size(device, ptr);
    int actual_device = device;
    if (size == 0) {
        // Not recorded on this device; it may have been allocated while the
        // calling thread was on a different device.
        int found = pool_find_device(ptr, &size);
        if (found >= 0) {
            actual_device = found;
        }
    }
    device_pool & p = pool_for(actual_device);
    // Free on the owning device, then restore the calling thread's device so
    // the tls_device cache in cuda_set_device stays truthful.
    auto release = [&]() {
        if (actual_device != tls_device) {
            cudaSetDevice(actual_device);
        }
        cudaFree(ptr);
        if (actual_device != tls_device) {
            cudaSetDevice(tls_device);
        }
        p.alloc_sizes.erase(ptr);
    };
    std::lock_guard<std::mutex> lock(p.mu);
    if (size == 0 || !g_pool_enabled || g_pool_max_cached_bytes == 0 || g_pool_max_block_bytes == 0 ||
        size > g_pool_max_block_bytes) {
        // Unknown size or caching disabled: actually free.
        release();
        return;
    }
    if (p.free_list.size() >= MAX_FREE_BLOCKS_PER_DEVICE || p.cached_bytes + size > g_pool_max_cached_bytes) {
        // Pool full: actually free.
        release();
        return;
    }
    p.free_list.push_back(free_block{ptr, size});
    p.cached_bytes += size;
}

} // namespace

int cuda_set_device(int id) {
    // cudaSetDevice is expensive when called repeatedly.
    // Cache per host thread since the CUDA device context is thread-affine.
    if (tls_device == id) {
        return 0;
    }
    CHECK_CUDA(cudaSetDevice(id));
    tls_device = id;
    return 0;
}

void* cuda_malloc(size_t size) {
    init_pool_config();
    const int device = tls_device;
    void * ptr = pool_alloc(device, size);
    if (ptr != nullptr) {
        return ptr;
    }
    ptr = nullptr;
    if (cudaMalloc(&ptr, size) != cudaSuccess) {
        // Clear the sticky error so subsequent CUDA calls are not poisoned.
        (void)cudaGetLastError();
        return nullptr;
    }
    pool_record_alloc(device, ptr, size);
    return ptr;
}

void cuda_free(void* ptr) {
    const int device = tls_device;
    pool_free(device, ptr);
}
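
// A minimal round-trip through the public API above (illustrative; `host`
// and the 1 MiB size are arbitrary stand-ins):
//
//   cuda_set_device(0);
//   void * dev = cuda_malloc(1 << 20);
//   if (dev != nullptr) {
//       cuda_memcpy_h2d(dev, host, 1 << 20);
//       cuda_free(dev); // cached in the per-device pool, not released
//   }
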
int cuda_synchronize() {
    CHECK_CUDA(cudaDeviceSynchronize());
    return 0;
}

int cuda_memcpy_h2d(void* dst, void* src, size_t size) {
    CHECK_CUDA(cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice));
    return 0;
}

int cuda_memcpy_d2h(void* dst, void* src, size_t size) {
    CHECK_CUDA(cudaMemcpy(dst, src, size, cudaMemcpyDeviceToHost));
    return 0;
}

int cuda_memcpy_d2d(void* dst, void* src, size_t size) {
    CHECK_CUDA(cudaMemcpy(dst, src, size, cudaMemcpyDeviceToDevice));
    return 0;
}

int cuda_mem_info(size_t* free_bytes, size_t* total_bytes) {
    // cudaMemGetInfo can return cudaErrorOperatingSystem in some restricted
    // environments even though allocations/kernels work. Fall back to a
    // probing allocation so higher-level placement logic can still function.
    cudaError_t err = cudaMemGetInfo(free_bytes, total_bytes);
    if (err == cudaSuccess) {
        return 0;
    }
    if (err == cudaErrorOperatingSystem) {
        // Some sandboxes block driver queries (MemGetInfo/GetDeviceProperties)
        // but still allow allocations. Approximate "free" with a probing alloc.
        (void)cudaGetLastError();
        size_t max_ok = 0;
        size_t probe = 256ULL << 20;          // start at 256 MiB
        const size_t max_probe = 64ULL << 30; // 64 GiB cap
        void* p = nullptr;
        // Double the probe until an allocation fails or the cap is reached,
        // bracketing the usable range as [max_ok, probe).
        while (probe <= max_probe) {
            cudaError_t e = cudaMalloc(&p, probe);
            if (e == cudaSuccess) {
                (void)cudaFree(p);
                p = nullptr;
                max_ok = probe;
                probe <<= 1;
                continue;
            }
            (void)cudaGetLastError();
            break;
        }
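        // Worked example (illustrative numbers): if 8 GiB succeeds but
        // 16 GiB fails, the loop above leaves max_ok = 8 GiB and
        // probe = 16 GiB; the bisection below then narrows that 8 GiB gap
        // to 64 MiB granularity in about log2(8 GiB / 64 MiB) = 7 probes.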
        size_t lo = max_ok;
        // Do not bisect past the probing cap: if the doubling loop exhausted
        // max_probe without a failure, lo == hi and the search is skipped.
        size_t hi = probe <= max_probe ? probe : max_probe;
        // Binary search down to 64 MiB granularity.
        const size_t gran = 64ULL << 20;
        while (hi > lo + gran) {
            size_t mid = lo + (hi - lo) / 2;
            mid = (mid / (1ULL << 20)) * (1ULL << 20); // align down to 1 MiB
            if (mid <= lo) {
                break;
            }
            cudaError_t e = cudaMalloc(&p, mid);
            if (e == cudaSuccess) {
                (void)cudaFree(p);
                p = nullptr;
                lo = mid;
            } else {
                (void)cudaGetLastError();
                hi = mid;
            }
        }
        if (free_bytes) {
            *free_bytes = lo;
        }
        if (total_bytes) {
            // Best effort: without driver queries the true capacity is
            // unknown, so report the probed free amount as the total too.
            *total_bytes = lo;
        }
        return 0;
    }
    fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(err));
    return 1;
}

int cuda_device_count(int* count) {
    int c = 0;
    CHECK_CUDA(cudaGetDeviceCount(&c));
    if (count) {
        *count = c;
    }
    return 0;
}