#include "ggml-alloc.h"
#include "ggml-backend-impl.h"
#include "ggml.h"
#include "ggml-impl.h"
#include <assert.h>
#include <limits.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MAX_FREE_BLOCKS 256

//#define GGML_ALLOCATOR_DEBUG

//#define AT_PRINTF(...) GGML_LOG_DEBUG(__VA_ARGS__)
#define AT_PRINTF(...)

static bool ggml_is_view(const struct ggml_tensor * t) {
    return t->view_src != NULL;
}

// ops that return true for this function must not use restrict pointers for their backend implementations
static bool ggml_op_can_inplace(enum ggml_op op) {
    switch (op) {
        case GGML_OP_SCALE:
        case GGML_OP_DIAG_MASK_ZERO:
        case GGML_OP_DIAG_MASK_INF:
        case GGML_OP_ADD:
        case GGML_OP_ADD1:
        case GGML_OP_SUB:
        case GGML_OP_MUL:
        case GGML_OP_DIV:
        case GGML_OP_SQR:
        case GGML_OP_SQRT:
        case GGML_OP_LOG:
        case GGML_OP_UNARY:
        case GGML_OP_ROPE:
        case GGML_OP_ROPE_BACK:
        case GGML_OP_SILU_BACK:
        case GGML_OP_RMS_NORM:
        case GGML_OP_RMS_NORM_BACK:
        case GGML_OP_SOFT_MAX:
        case GGML_OP_SOFT_MAX_BACK:
            return true;

        default:
            return false;
    }
}

static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
    assert(alignment && !(alignment & (alignment - 1))); // power of 2
    size_t align = (alignment - (((uintptr_t)buffer + offset) % alignment)) % alignment;
    return offset + align;
}
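
// Worked example (illustrative only): with a 32-byte alignment and a base pointer
// that is itself 32-byte aligned, aligned_offset(base, 13, 32) returns 32 and
// aligned_offset(base, 64, 32) returns 64; offsets that are already aligned are
// returned unchanged.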

// tallocr

struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer) {
    void * base = ggml_backend_buffer_get_base(buffer);
    size_t align = ggml_backend_buffer_get_alignment(buffer);

    assert(align && !(align & (align - 1))); // power of 2

    struct ggml_tallocr talloc = (struct ggml_tallocr) {
        /*.buffer    = */ buffer,
        /*.base      = */ base,
        /*.alignment = */ align,
        /*.offset    = */ aligned_offset(base, 0, align),
    };
    return talloc;
}

enum ggml_status ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor) {
    size_t size = ggml_backend_buffer_get_alloc_size(talloc->buffer, tensor);
    size = GGML_PAD(size, talloc->alignment);

    if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
        GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
                __func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
        GGML_ABORT("not enough space in the buffer");
    }

    void * addr = (char *)ggml_backend_buffer_get_base(talloc->buffer) + talloc->offset;
    talloc->offset += size;

    assert(((uintptr_t)addr % talloc->alignment) == 0);

    return ggml_backend_tensor_alloc(talloc->buffer, tensor, addr);
}
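
// Illustrative usage sketch (comment only, not compiled here): ggml_tallocr is a
// simple linear (bump) allocator over a single pre-allocated backend buffer. The
// buffer size and tensor below are hypothetical; tensors must come from a context
// created with no_alloc = true.
//
//   ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, buf_size);
//   struct ggml_tallocr talloc = ggml_tallocr_new(buf);
//   enum ggml_status st = ggml_tallocr_alloc(&talloc, some_tensor);
//   // st != GGML_STATUS_SUCCESS means the backend failed to initialize the tensor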

// dynamic tensor allocator

struct free_block {
    size_t offset;
    size_t size;
};

struct ggml_dyn_tallocr {
    size_t alignment;
    int n_free_blocks;
    struct free_block free_blocks[MAX_FREE_BLOCKS];
    size_t max_size;

#ifdef GGML_ALLOCATOR_DEBUG
    struct {
        const struct ggml_tensor * tensor;
        size_t offset;
    } allocated_tensors[1024];
#endif
};

#ifdef GGML_ALLOCATOR_DEBUG
static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
    for (int i = 0; i < 1024; i++) {
        if (alloc->allocated_tensors[i].tensor == NULL) {
            alloc->allocated_tensors[i].tensor = tensor;
            alloc->allocated_tensors[i].offset = offset;
            return;
        }
    }
    GGML_ABORT("out of allocated_tensors");
}

static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
    for (int i = 0; i < 1024; i++) {
        if (alloc->allocated_tensors[i].offset == offset) {
            alloc->allocated_tensors[i].tensor = NULL;
            return;
        }
    }
    GGML_ABORT("tried to free tensor %s not found\n", tensor->name);
}
#endif

static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t size, const struct ggml_tensor * tensor) {
    size = aligned_offset(NULL, size, alloc->alignment);

    AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);

    size_t max_avail = 0;

    // find the best fitting free block besides the last block
    int best_fit_block = -1;
    size_t best_fit_size = SIZE_MAX;
    for (int i = 0; i < alloc->n_free_blocks - 1; i++) {
        struct free_block * block = &alloc->free_blocks[i];
        max_avail = MAX(max_avail, block->size);
        if (block->size >= size && block->size <= best_fit_size) {
            best_fit_block = i;
            best_fit_size = block->size;
        }
    }

    if (best_fit_block == -1) {
        // the last block is our last resort
        struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
        max_avail = MAX(max_avail, block->size);
        if (block->size >= size) {
            best_fit_block = alloc->n_free_blocks - 1;
        } else {
            // this should never happen
            GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
                    __func__, size, max_avail);
            GGML_ABORT("not enough space in the buffer");
        }
    }

    struct free_block * block = &alloc->free_blocks[best_fit_block];
    size_t offset = block->offset;
    block->offset = offset + size;
    block->size -= size;
    if (block->size == 0) {
        // remove block if empty
        alloc->n_free_blocks--;
        for (int j = best_fit_block; j < alloc->n_free_blocks; j++) {
            alloc->free_blocks[j] = alloc->free_blocks[j+1];
        }
    }

    AT_PRINTF("block %d, offset %zu\n", best_fit_block, offset);

#ifdef GGML_ALLOCATOR_DEBUG
    add_allocated_tensor(alloc, offset, tensor);
    size_t cur_max = offset + size;
    if (cur_max > alloc->max_size) {
        // sort allocated_tensors by offset
        for (int i = 0; i < 1024; i++) {
            for (int j = i + 1; j < 1024; j++) {
                if (alloc->allocated_tensors[i].offset > alloc->allocated_tensors[j].offset) {
                    const struct ggml_tensor * tmp_tensor = alloc->allocated_tensors[i].tensor;
                    size_t tmp_offset = alloc->allocated_tensors[i].offset;
                    alloc->allocated_tensors[i].tensor = alloc->allocated_tensors[j].tensor;
                    alloc->allocated_tensors[i].offset = alloc->allocated_tensors[j].offset;
                    alloc->allocated_tensors[j].tensor = tmp_tensor;
                    alloc->allocated_tensors[j].offset = tmp_offset;
                }
            }
        }
        GGML_LOG_DEBUG("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
        for (int i = 0; i < 1024; i++) {
            if (alloc->allocated_tensors[i].tensor) {
                GGML_LOG_DEBUG("%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
                    alloc->allocated_tensors[i].offset,
                    alloc->allocated_tensors[i].offset + ggml_nbytes(alloc->allocated_tensors[i].tensor),
                    ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0);
            }
        }
        GGML_LOG_DEBUG("\n");
    }
#endif

    alloc->max_size = MAX(alloc->max_size, offset + size);

    return offset;

    GGML_UNUSED(tensor);
}

// this is a very naive implementation, but for our case the number of free blocks should be very small
static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, size_t size, const struct ggml_tensor * tensor) {
    size = aligned_offset(NULL, size, alloc->alignment);

    AT_PRINTF("%s: freeing %s at %zu (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, offset, size, alloc->n_free_blocks);

#ifdef GGML_ALLOCATOR_DEBUG
    remove_allocated_tensor(alloc, offset, tensor);
#endif

    // see if we can merge with an existing block
    for (int i = 0; i < alloc->n_free_blocks; i++) {
        struct free_block * block = &alloc->free_blocks[i];
        // check if ptr is at the end of the block
        if (block->offset + block->size == offset) {
            block->size += size;
            // check if we can merge with the next block
            if (i < alloc->n_free_blocks - 1 && block->offset + block->size == alloc->free_blocks[i+1].offset) {
                block->size += alloc->free_blocks[i+1].size;
                alloc->n_free_blocks--;
                for (int j = i+1; j < alloc->n_free_blocks; j++) {
                    alloc->free_blocks[j] = alloc->free_blocks[j+1];
                }
            }
            return;
        }
        // check if ptr is at the beginning of the block
        if (offset + size == block->offset) {
            block->offset = offset;
            block->size += size;
            // check if we can merge with the previous block
            if (i > 0 && alloc->free_blocks[i-1].offset + alloc->free_blocks[i-1].size == block->offset) {
                alloc->free_blocks[i-1].size += block->size;
                alloc->n_free_blocks--;
                for (int j = i; j < alloc->n_free_blocks; j++) {
                    alloc->free_blocks[j] = alloc->free_blocks[j+1];
                }
            }
            return;
        }
    }
    // otherwise, add a new block
    GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
    // insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
    int insert_pos = 0;
    while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].offset < offset) {
        insert_pos++;
    }
    // shift all blocks from insert_pos onward to make room for the new block
    for (int i = alloc->n_free_blocks; i > insert_pos; i--) {
        alloc->free_blocks[i] = alloc->free_blocks[i-1];
    }
    // insert the new block
    alloc->free_blocks[insert_pos].offset = offset;
    alloc->free_blocks[insert_pos].size = size;
    alloc->n_free_blocks++;

    GGML_UNUSED(tensor);
}

static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
    alloc->n_free_blocks = 1;
    alloc->free_blocks[0].offset = 0;
    alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
    alloc->max_size = 0;

#ifdef GGML_ALLOCATOR_DEBUG
    for (int i = 0; i < 1024; i++) {
        alloc->allocated_tensors[i].tensor = NULL;
    }
#endif
}

static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment) {
    struct ggml_dyn_tallocr * alloc = (struct ggml_dyn_tallocr *)malloc(sizeof(struct ggml_dyn_tallocr));

    *alloc = (struct ggml_dyn_tallocr) {
        /*.alignment     = */ alignment,
        /*.n_free_blocks = */ 0,
        /*.free_blocks   = */ {{0}},
        /*.max_size      = */ 0,
#ifdef GGML_ALLOCATOR_DEBUG
        /*.allocated_tensors = */ {{0}},
#endif
    };

    ggml_dyn_tallocr_reset(alloc);

    return alloc;
}

static void ggml_dyn_tallocr_free(struct ggml_dyn_tallocr * alloc) {
    free(alloc);
}

static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc) {
    return alloc->max_size;
}
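
// Note: ggml_dyn_tallocr never touches real memory. It plans offsets inside a
// virtual address range (the single SIZE_MAX/2 block set up in
// ggml_dyn_tallocr_reset) and tracks the high-water mark in max_size; the graph
// allocator below uses that value to size the actual backend buffers.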

/////////////////////////////////////

// graph allocator

struct hash_node {
    int n_children;
    int n_views;
    int buffer_id;
    size_t offset; // offset within the buffer
    bool allocated;
};

struct tensor_alloc {
    int buffer_id;
    size_t offset;
    size_t size_max; // 0 = pre-allocated, unused, or view
};

struct leaf_alloc {
    struct tensor_alloc leaf;
};

struct node_alloc {
    struct tensor_alloc dst;
    struct tensor_alloc src[GGML_MAX_SRC];
};

struct ggml_gallocr {
    ggml_backend_buffer_type_t * bufts; // [n_buffers]
    ggml_backend_buffer_t * buffers; // [n_buffers]
    struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
    int n_buffers;

    struct ggml_hash_set hash_set;
    struct hash_node * hash_values; // [hash_set.size]

    struct node_alloc * node_allocs; // [n_nodes]
    int n_nodes;

    struct leaf_alloc * leaf_allocs; // [n_leafs]
    int n_leafs;
};

ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
    ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(1, sizeof(struct ggml_gallocr));
    GGML_ASSERT(galloc != NULL);

    galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t));
    GGML_ASSERT(galloc->bufts != NULL);

    galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
    GGML_ASSERT(galloc->buffers != NULL);

    galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
    GGML_ASSERT(galloc->buf_tallocs != NULL);

    for (int i = 0; i < n_bufs; i++) {
        galloc->bufts[i] = bufts[i];
        galloc->buffers[i] = NULL;

        // check if the same buffer type is used multiple times and reuse the same allocator
        for (int j = 0; j < i; j++) {
            if (bufts[i] == bufts[j]) {
                galloc->buf_tallocs[i] = galloc->buf_tallocs[j];
                break;
            }
        }

        if (galloc->buf_tallocs[i] == NULL) {
            size_t alignment = ggml_backend_buft_get_alignment(bufts[i]);
            galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment);
        }
    }
    galloc->n_buffers = n_bufs;

    return galloc;
}

ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft) {
    return ggml_gallocr_new_n(&buft, 1);
}
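
// Illustrative lifecycle sketch (comment only; the `backend` handle and the graphs
// are assumptions for the example, not part of this file):
//
//   ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
//   // size the compute buffer once with a worst-case graph ...
//   ggml_gallocr_reserve(galloc, worst_case_graph);
//   // ... then assign addresses to each graph before it is evaluated
//   ggml_gallocr_alloc_graph(galloc, graph);
//   size_t buf_size = ggml_gallocr_get_buffer_size(galloc, 0);
//   ggml_gallocr_free(galloc);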

void ggml_gallocr_free(ggml_gallocr_t galloc) {
    if (galloc == NULL) {
        return;
    }

    for (int i = 0; i < galloc->n_buffers; i++) {
        if (galloc->buffers != NULL) {
            // skip if already freed
            bool freed = false;
            for (int j = 0; j < i; j++) {
                if (galloc->buffers[j] == galloc->buffers[i]) {
                    freed = true;
                    break;
                }
            }
            if (!freed) {
                ggml_backend_buffer_free(galloc->buffers[i]);
            }
        }
        if (galloc->buf_tallocs != NULL) {
            // skip if already freed
            bool freed = false;
            for (int j = 0; j < i; j++) {
                if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) {
                    freed = true;
                    break;
                }
            }
            if (!freed) {
                ggml_dyn_tallocr_free(galloc->buf_tallocs[i]);
            }
        }
    }

    ggml_hash_set_free(&galloc->hash_set);
    free(galloc->hash_values);
    free(galloc->bufts);
    free(galloc->buffers);
    free(galloc->buf_tallocs);
    free(galloc->node_allocs);
    free(galloc->leaf_allocs);
    free(galloc);
}

typedef struct ggml_gallocr * ggml_gallocr_t;

static struct hash_node * ggml_gallocr_hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) {
    size_t i = ggml_hash_find_or_insert(&galloc->hash_set, t);
    return &galloc->hash_values[i];
}

static bool ggml_gallocr_is_own(ggml_gallocr_t galloc, struct ggml_tensor * t) {
    return ggml_gallocr_hash_get(galloc, t)->allocated;
}

static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor * t) {
    return t->data != NULL || ggml_gallocr_hash_get(galloc, t)->allocated;
}

static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
    GGML_ASSERT(buffer_id >= 0);
    struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);

    if (!ggml_gallocr_is_allocated(galloc, node) && !ggml_is_view(node)) {
        hn->allocated = true;
        assert(hn->offset == 0);

        // try to reuse a parent's buffer (inplace)
        if (ggml_op_can_inplace(node->op)) {
            for (int i = 0; i < GGML_MAX_SRC; i++) {
                struct ggml_tensor * parent = node->src[i];
                if (parent == NULL) {
                    continue;
                }

                // if the node's data is external, then we cannot re-use it
                if (!ggml_gallocr_is_own(galloc, parent)) {
                    AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
                    continue;
                }

                // outputs cannot be reused
                if (parent->flags & GGML_TENSOR_FLAG_OUTPUT || (parent->view_src != NULL && parent->view_src->flags & GGML_TENSOR_FLAG_OUTPUT)) {
                    AT_PRINTF("not reusing parent %s for %s as it is an output\n", parent->name, node->name);
                    continue;
                }

                if (!ggml_are_same_layout(node, parent)) {
                    AT_PRINTF("not reusing parent %s for %s as layouts are different\n", parent->name, node->name);
                    continue;
                }

                struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
                if (p_hn->n_children == 1 && p_hn->n_views == 0) {
                    if (ggml_is_view(parent)) {
                        struct ggml_tensor * view_src = parent->view_src;
                        struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src);
                        if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
                            AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
                            assert(view_src_hn->offset == p_hn->offset);
                            hn->buffer_id = p_hn->buffer_id;
                            hn->offset = p_hn->offset;
                            p_hn->allocated = false; // avoid freeing the parent
                            view_src_hn->allocated = false;
                            return;
                        }
                    } else {
                        AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
                        hn->buffer_id = p_hn->buffer_id;
                        hn->offset = p_hn->offset;
                        p_hn->allocated = false; // avoid freeing the parent
                        return;
                    }
                }
            }
        }

        // allocate tensor from the buffer
        struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
        ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
        size_t size = ggml_backend_buft_get_alloc_size(buft, node);
        size_t offset = ggml_dyn_tallocr_alloc(alloc, size, node);
        hn->buffer_id = buffer_id;
        hn->offset = offset;
    }
}

static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
    // graph outputs are never freed
    if (node->flags & GGML_TENSOR_FLAG_OUTPUT) {
        AT_PRINTF("not freeing output %s\n", node->name);
        return;
    }

    struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
    size_t offset = hn->offset;
    int buffer_id = hn->buffer_id;
    struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
    ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
    size_t size = ggml_backend_buft_get_alloc_size(buft, node);
    ggml_dyn_tallocr_free_tensor(alloc, offset, size, node);
    hn->allocated = false;
}

static int get_node_buffer_id(const int * node_buffer_ids, int i) {
    return node_buffer_ids ? node_buffer_ids[i] : 0;
}
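
// Descriptive note: ggml_gallocr_alloc_graph_impl plans tensor placement by walking
// the graph in order. It first counts how many children and views reference each
// tensor, then allocates each node (reusing a parent's memory in-place when
// ggml_op_can_inplace allows it) and releases a parent's planned block back to the
// dynamic allocator as soon as its child and view counts drop to zero, so the same
// memory can be reused by later nodes.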

static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
    // clear hash tables
    ggml_hash_set_reset(&galloc->hash_set);
    memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size);

    // allocate leafs
    // these may be tensors that the application is not using in the graph, but may still want to allocate for other purposes
    for (int i = 0; i < graph->n_leafs; i++) {
        struct ggml_tensor * leaf = graph->leafs[i];
        ggml_gallocr_allocate_node(galloc, leaf, get_node_buffer_id(leaf_buffer_ids, i));
    }

    // count number of children and views
    // allocate other graph inputs and leafs first to avoid overwriting them
    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];

        // TODO: better way to add external dependencies
        // GGML_OP_NONE does not appear normally in the graph nodes, but is used by ggml-backend to add dependencies to
        // control when some tensors are allocated and freed. in this case, the dependencies are in `src`, but the node
        // itself is never used and should not be considered a dependency
        if (ggml_is_view(node) && node->op != GGML_OP_NONE) {
            struct ggml_tensor * view_src = node->view_src;
            ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
        }

        if (node->flags & GGML_TENSOR_FLAG_INPUT) {
            ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i));
        }

        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * src = node->src[j];
            if (src == NULL) {
                continue;
            }

            ggml_gallocr_hash_get(galloc, src)->n_children += 1;

            // allocate explicit inputs
            if (src->flags & GGML_TENSOR_FLAG_INPUT) {
                ggml_gallocr_allocate_node(galloc, src, get_node_buffer_id(node_buffer_ids, i));
            }
        }
    }

    // allocate tensors
    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];
        int buffer_id = get_node_buffer_id(node_buffer_ids, i);

        // allocate parents (only leafs need to be allocated at this point)
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * parent = node->src[j];
            if (parent == NULL) {
                continue;
            }
            ggml_gallocr_allocate_node(galloc, parent, buffer_id);
        }

        // allocate node
        ggml_gallocr_allocate_node(galloc, node, buffer_id);

        AT_PRINTF("exec: %s (%s) <= ", ggml_op_desc(node), node->name);
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * parent = node->src[j];
            if (parent == NULL) {
                continue;
            }
            AT_PRINTF("%s", parent->name);
            if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
                AT_PRINTF(", ");
            }
        }
        AT_PRINTF("\n");

        // update parents
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * parent = node->src[j];
            if (parent == NULL) {
                continue;
            }
            struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
            p_hn->n_children -= 1;

            AT_PRINTF("parent %s: %d children, %d views, allocated: %d\n",
                parent->name, p_hn->n_children, p_hn->n_views, p_hn->allocated);

            if (p_hn->n_children == 0 && p_hn->n_views == 0) {
                if (ggml_is_view(parent)) {
                    struct ggml_tensor * view_src = parent->view_src;
                    struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src);
                    view_src_hn->n_views -= 1;
                    AT_PRINTF("view_src %s: %d children, %d views\n",
                        view_src->name, view_src_hn->n_children, view_src_hn->n_views);
                    if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src_hn->allocated) {
                        ggml_gallocr_free_node(galloc, view_src);
                    }
                }
                else if (p_hn->allocated) {
                    ggml_gallocr_free_node(galloc, parent);
                }
            }
            AT_PRINTF("\n");
        }
    }
}

bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
    size_t min_hash_size = graph->n_nodes + graph->n_leafs;
    // add 25% margin to avoid hash collisions
    min_hash_size += min_hash_size / 4;

    // initialize hash table
    if (galloc->hash_set.size < min_hash_size) {
        ggml_hash_set_free(&galloc->hash_set);
        galloc->hash_set = ggml_hash_set_new(min_hash_size);
        GGML_ASSERT(galloc->hash_set.keys != NULL);

        free(galloc->hash_values);
        galloc->hash_values = malloc(sizeof(struct hash_node) * galloc->hash_set.size);
        GGML_ASSERT(galloc->hash_values != NULL);
    }

    // reset allocators
    for (int i = 0; i < galloc->n_buffers; i++) {
        ggml_dyn_tallocr_reset(galloc->buf_tallocs[i]);
    }

    // allocate in hash table
    ggml_gallocr_alloc_graph_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids);

    // set the node_allocs from the hash table
    if (galloc->n_nodes < graph->n_nodes) {
        free(galloc->node_allocs);
        galloc->node_allocs = calloc(graph->n_nodes, sizeof(struct node_alloc));
        GGML_ASSERT(galloc->node_allocs != NULL);
    }
    galloc->n_nodes = graph->n_nodes;
    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];
        struct node_alloc * node_alloc = &galloc->node_allocs[i];
        if (node->view_src || node->data) {
            node_alloc->dst.buffer_id = -1;
            node_alloc->dst.offset = SIZE_MAX;
            node_alloc->dst.size_max = 0;
        } else {
            struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
            node_alloc->dst.buffer_id = hn->buffer_id;
            node_alloc->dst.offset = hn->offset;
            node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
        }
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * src = node->src[j];
            if (!src || src->view_src || src->data) {
                node_alloc->src[j].buffer_id = -1;
                node_alloc->src[j].offset = SIZE_MAX;
                node_alloc->src[j].size_max = 0;
            } else {
                struct hash_node * hn = ggml_gallocr_hash_get(galloc, src);
                node_alloc->src[j].buffer_id = hn->buffer_id;
                node_alloc->src[j].offset = hn->offset;
                node_alloc->src[j].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src);
            }
        }
    }
    if (galloc->n_leafs < graph->n_leafs) {
        free(galloc->leaf_allocs);
        galloc->leaf_allocs = calloc(graph->n_leafs, sizeof(galloc->leaf_allocs[0]));
        GGML_ASSERT(galloc->leaf_allocs != NULL);
    }
    galloc->n_leafs = graph->n_leafs;
    for (int i = 0; i < graph->n_leafs; i++) {
        struct ggml_tensor * leaf = graph->leafs[i];
        struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
        if (leaf->view_src || leaf->data) {
            galloc->leaf_allocs[i].leaf.buffer_id = -1;
            galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
            galloc->leaf_allocs[i].leaf.size_max = 0;
        } else {
            galloc->leaf_allocs[i].leaf.buffer_id = hn->buffer_id;
            galloc->leaf_allocs[i].leaf.offset = hn->offset;
            galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
        }
    }

    // reallocate buffers if needed
    for (int i = 0; i < galloc->n_buffers; i++) {
        // if the buffer type is used multiple times, we reuse the same buffer
        for (int j = 0; j < i; j++) {
            if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) {
                galloc->buffers[i] = galloc->buffers[j];
                break;
            }
        }

        size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0;
        size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);

        // even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
        if (new_size > cur_size || galloc->buffers[i] == NULL) {
#ifndef NDEBUG
            GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
#endif

            ggml_backend_buffer_free(galloc->buffers[i]);
            galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
            if (galloc->buffers[i] == NULL) {
                GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
                return false;
            }
            ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
        }
    }

    return true;
}

bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
    return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
}

static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, struct tensor_alloc * tensor_alloc) {
    int buffer_id = tensor_alloc->buffer_id;
    assert(tensor->data || tensor->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);

    if (tensor->view_src != NULL) {
        if (tensor->buffer == NULL) {
            assert(tensor_alloc->offset == SIZE_MAX);
            if (tensor->view_src->buffer == NULL) {
                // this tensor was allocated without ggml-backend
                return;
            }
            ggml_backend_view_init(tensor);
        }
    } else {
        if (tensor->data == NULL) {
            assert(tensor_alloc->offset != SIZE_MAX);
            assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
            void * base = ggml_backend_buffer_get_base(galloc->buffers[buffer_id]);
            void * addr = (char *)base + tensor_alloc->offset;
            ggml_backend_tensor_alloc(galloc->buffers[buffer_id], tensor, addr);
        } else {
            if (tensor->buffer == NULL) {
                // this tensor was allocated without ggml-backend
                return;
            }
        }
    }
}

static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
    size_t node_size = 0;
    if (!node->data && !node->view_src) {
        // If we previously had data but don't now then reallocate
        if (talloc->buffer_id < 0) {
            return false;
        }
        node_size = ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
    }
    return talloc->size_max >= node_size;
}

static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
    if (galloc->n_nodes != graph->n_nodes) {
#ifndef NDEBUG
        GGML_LOG_DEBUG("%s: graph has different number of nodes\n", __func__);
#endif
        return true;
    }

    if (galloc->n_leafs != graph->n_leafs) {
#ifndef NDEBUG
        GGML_LOG_DEBUG("%s: graph has different number of leafs\n", __func__);
#endif
        return true;
    }

    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];
        struct node_alloc * node_alloc = &galloc->node_allocs[i];

        if (!ggml_gallocr_node_needs_realloc(galloc, node, &node_alloc->dst)) {
#ifndef NDEBUG
            GGML_LOG_DEBUG("%s: node %s is not valid\n", __func__, node->name);
#endif
            return true;
        }

        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * src = node->src[j];
            if (src == NULL) {
                continue;
            }
            if (!ggml_gallocr_node_needs_realloc(galloc, src, &node_alloc->src[j])) {
#ifndef NDEBUG
                GGML_LOG_DEBUG("%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
#endif
                return true;
            }
        }
    }

    return false;
}

bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
    if (ggml_gallocr_needs_realloc(galloc, graph)) {
        if (galloc->n_buffers == 1) {
#ifndef NDEBUG
            GGML_LOG_DEBUG("%s: reallocating buffers automatically\n", __func__);
#endif
            if (!ggml_gallocr_reserve(galloc, graph)) {
                return false;
            }
        } else {
#ifndef NDEBUG
            GGML_LOG_DEBUG("%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
#endif
            return false;
        }
    }

    // reset buffers
    for (int i = 0; i < galloc->n_buffers; i++) {
        if (galloc->buffers[i] != NULL) {
            ggml_backend_buffer_reset(galloc->buffers[i]);
        }
    }

    // allocate the graph tensors from the previous assignments
    // leafs
    for (int i = 0; i < graph->n_leafs; i++) {
        struct ggml_tensor * leaf = graph->leafs[i];
        struct leaf_alloc * leaf_alloc = &galloc->leaf_allocs[i];
        ggml_gallocr_init_tensor(galloc, leaf, &leaf_alloc->leaf);
    }
    // nodes
    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];
        struct node_alloc * node_alloc = &galloc->node_allocs[i];
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * src = node->src[j];
            if (src == NULL) {
                continue;
            }
            ggml_gallocr_init_tensor(galloc, src, &node_alloc->src[j]);
        }
        ggml_gallocr_init_tensor(galloc, node, &node_alloc->dst);
    }

    return true;
}
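
// Illustrative per-evaluation pattern (comment only; the graph construction helper
// and the backend compute call are outside this file and shown only as assumptions):
//
//   for (;;) {
//       struct ggml_cgraph * graph = build_graph(...);   // hypothetical helper
//       if (!ggml_gallocr_alloc_graph(galloc, graph)) {
//           // multi-buffer graphs require an explicit ggml_gallocr_reserve_n first
//           break;
//       }
//       ggml_backend_graph_compute(backend, graph);
//   }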

size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
    GGML_ASSERT(buffer_id >= 0 && buffer_id < galloc->n_buffers);

    if (galloc->buffers[buffer_id] == NULL) {
        return 0;
    }

    for (int i = 0; i < buffer_id; i++) {
        if (galloc->buffers[i] == galloc->buffers[buffer_id]) {
            // this buffer is the same as a previous one due to the same buffer type being used multiple times
            // only return the buffer size the first time it appears to avoid double counting
            return 0;
        }
    }

    return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
}

// utils

static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) {
    for (size_t i = 0; i < *n_buffers; i++) {
        ggml_backend_buffer_free((*buffers)[i]);
    }
    free(*buffers);
}

static bool alloc_tensor_range(struct ggml_context * ctx,
        struct ggml_tensor * first, struct ggml_tensor * last,
        ggml_backend_buffer_type_t buft, size_t size,
        ggml_backend_buffer_t ** buffers, size_t * n_buffers) {

    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
    if (buffer == NULL) {
        GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
        free_buffers(buffers, n_buffers);
        return false;
    }

    *buffers = realloc(*buffers, sizeof(ggml_backend_buffer_t) * (*n_buffers + 1));
    (*buffers)[(*n_buffers)++] = buffer;

    struct ggml_tallocr tallocr = ggml_tallocr_new(buffer);

    for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
        enum ggml_status status = GGML_STATUS_SUCCESS;
        if (t->data == NULL) {
            if (t->view_src == NULL) {
                status = ggml_tallocr_alloc(&tallocr, t);
            } else if (t->buffer == NULL) {
                status = ggml_backend_view_init(t);
            }
        } else {
            if (t->view_src != NULL && t->buffer == NULL) {
                // view of a pre-allocated tensor
                status = ggml_backend_view_init(t);
            }
        }
        if (status != GGML_STATUS_SUCCESS) {
            GGML_LOG_ERROR("%s: failed to initialize tensor %s\n", __func__, t->name);
            free_buffers(buffers, n_buffers);
            return false;
        }
    }

    return true;
}

ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
    GGML_ASSERT(ggml_get_no_alloc(ctx) == true);

    size_t alignment = ggml_backend_buft_get_alignment(buft);
    size_t max_size = ggml_backend_buft_get_max_size(buft);

    ggml_backend_buffer_t * buffers = NULL;
    size_t n_buffers = 0;

    size_t cur_buf_size = 0;
    struct ggml_tensor * first = ggml_get_first_tensor(ctx);
    for (struct ggml_tensor * t = first; t != NULL; t = ggml_get_next_tensor(ctx, t)) {
        size_t this_size = 0;
        if (t->data == NULL && t->view_src == NULL) {
            this_size = GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
        }

        if (cur_buf_size > 0 && (cur_buf_size + this_size) > max_size) {
            // allocate tensors in the current buffer
            if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
                return NULL;
            }
            first = t;
            cur_buf_size = this_size;
        } else {
            cur_buf_size += this_size;
        }
    }

    // allocate remaining tensors
    if (cur_buf_size > 0) {
        if (!alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
            return NULL;
        }
    }

    if (n_buffers == 0) {
#ifndef NDEBUG
        GGML_LOG_DEBUG("%s: all tensors in the context are already allocated\n", __func__);
#endif
        return NULL;
    }

    ggml_backend_buffer_t buffer;
    if (n_buffers == 1) {
        buffer = buffers[0];
    } else {
        buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers);
    }
    free(buffers);
    return buffer;
}

ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend) {
    return ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_get_default_buffer_type(backend));
}
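
// Illustrative usage sketch (comment only): allocating every tensor of a no_alloc
// context into one backend buffer, e.g. for model weights. The context setup, the
// tensor count and the `backend` handle below are assumptions for the example.
//
//   struct ggml_init_params params = {
//       /*.mem_size   =*/ n_tensors * ggml_tensor_overhead(),
//       /*.mem_buffer =*/ NULL,
//       /*.no_alloc   =*/ true,   // required: only tensor metadata lives in ctx
//   };
//   struct ggml_context * ctx = ggml_init(params);
//   // ... create tensors in ctx (metadata only, no data) ...
//   ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);
//   // buf == NULL means allocation failed or all tensors were already allocated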