// ggml-alloc.c
#include "ggml-alloc.h"
#include "ggml-backend-impl.h"
#include "ggml.h"
#include "ggml-impl.h"
#include <assert.h>
#include <limits.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MAX_FREE_BLOCKS 256

//#define GGML_ALLOCATOR_DEBUG

//#define AT_PRINTF(...) fprintf(stderr, __VA_ARGS__)
#define AT_PRINTF(...)

static bool ggml_is_view(const struct ggml_tensor * t) {
    return t->view_src != NULL;
}

static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
    if (a->type != b->type) {
        return false;
    }
    for (int i = 0; i < GGML_MAX_DIMS; i++) {
        if (a->ne[i] != b->ne[i]) {
            return false;
        }
        if (a->nb[i] != b->nb[i]) {
            return false;
        }
    }
    return true;
}

static bool ggml_op_can_inplace(enum ggml_op op) {
    switch (op) {
        case GGML_OP_SCALE:
        case GGML_OP_DIAG_MASK_ZERO:
        case GGML_OP_DIAG_MASK_INF:
        case GGML_OP_ADD:
        case GGML_OP_ADD1:
        case GGML_OP_SUB:
        case GGML_OP_MUL:
        case GGML_OP_DIV:
        case GGML_OP_SQR:
        case GGML_OP_SQRT:
        case GGML_OP_LOG:
        case GGML_OP_UNARY:
        case GGML_OP_ROPE:
        case GGML_OP_RMS_NORM:
        case GGML_OP_SOFT_MAX:
            return true;

        default:
            return false;
    }
}

static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
    assert(alignment && !(alignment & (alignment - 1))); // power of 2
    size_t align = (alignment - (((uintptr_t)buffer + offset) % alignment)) % alignment;
    return offset + align;
}
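
// For illustration of aligned_offset() above: with buffer = (void *)4096, offset = 3 and
// alignment = 32, (4096 + 3) % 32 == 3, so align == (32 - 3) % 32 == 29 and the function
// returns 3 + 29 == 32, i.e. the next offset whose absolute address is 32-byte aligned.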

// tallocr

struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer) {
    void * base = ggml_backend_buffer_get_base(buffer);
    size_t align = ggml_backend_buffer_get_alignment(buffer);

    assert(align && !(align & (align - 1))); // power of 2

    struct ggml_tallocr talloc = (struct ggml_tallocr) {
        /*.buffer    = */ buffer,
        /*.base      = */ base,
        /*.alignment = */ align,
        /*.offset    = */ aligned_offset(base, 0, align),
    };
    return talloc;
}

void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor) {
    size_t size = ggml_backend_buffer_get_alloc_size(talloc->buffer, tensor);
    size = GGML_PAD(size, talloc->alignment);

    if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
        fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
                __func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
        GGML_ABORT("not enough space in the buffer");
    }

    void * addr = (char *)ggml_backend_buffer_get_base(talloc->buffer) + talloc->offset;
    talloc->offset += size;

    assert(((uintptr_t)addr % talloc->alignment) == 0);

    ggml_backend_tensor_alloc(talloc->buffer, tensor, addr);
}
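
// Illustrative usage sketch of the linear (bump) allocator above, assuming `buft`
// is a ggml_backend_buffer_type_t, `buf_size` is large enough, and `t0`, `t1` are
// tensors created in a no_alloc context (hypothetical names, error handling omitted):
//
//     ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, buf_size);
//     struct ggml_tallocr tallocr = ggml_tallocr_new(buf);
//     ggml_tallocr_alloc(&tallocr, t0);
//     ggml_tallocr_alloc(&tallocr, t1); // placed right after t0, padded to the buffer alignment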

// dynamic tensor allocator

struct free_block {
    size_t offset;
    size_t size;
};

struct ggml_dyn_tallocr {
    size_t alignment;
    int n_free_blocks;
    struct free_block free_blocks[MAX_FREE_BLOCKS];
    size_t max_size;

#ifdef GGML_ALLOCATOR_DEBUG
    struct {
        const struct ggml_tensor * tensor;
        size_t offset;
    } allocated_tensors[1024];
#endif
};

#ifdef GGML_ALLOCATOR_DEBUG
static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
    for (int i = 0; i < 1024; i++) {
        if (alloc->allocated_tensors[i].tensor == NULL) {
            alloc->allocated_tensors[i].tensor = tensor;
            alloc->allocated_tensors[i].offset = offset;
            return;
        }
    }
    GGML_ABORT("out of allocated_tensors");
}

static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
    for (int i = 0; i < 1024; i++) {
        if (alloc->allocated_tensors[i].offset == offset) {
            alloc->allocated_tensors[i].tensor = NULL;
            return;
        }
    }
    GGML_ABORT("tried to free tensor %s not found\n", tensor->name);
}
#endif

static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t size, const struct ggml_tensor * tensor) {
    size = aligned_offset(NULL, size, alloc->alignment);

    AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);

    size_t max_avail = 0;

    // find the best fitting free block besides the last block
    int best_fit_block = -1;
    size_t best_fit_size = SIZE_MAX;
    for (int i = 0; i < alloc->n_free_blocks - 1; i++) {
        struct free_block * block = &alloc->free_blocks[i];
        max_avail = MAX(max_avail, block->size);
        if (block->size >= size && block->size <= best_fit_size) {
            best_fit_block = i;
            best_fit_size = block->size;
        }
    }

    if (best_fit_block == -1) {
        // the last block is our last resort
        struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
        max_avail = MAX(max_avail, block->size);
        if (block->size >= size) {
            best_fit_block = alloc->n_free_blocks - 1;
        } else {
            // this should never happen
            fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
                    __func__, size, max_avail);
            GGML_ABORT("not enough space in the buffer");
        }
    }

    struct free_block * block = &alloc->free_blocks[best_fit_block];
    size_t offset = block->offset;
    block->offset = offset + size;
    block->size -= size;
    if (block->size == 0) {
        // remove block if empty
        alloc->n_free_blocks--;
        for (int j = best_fit_block; j < alloc->n_free_blocks; j++) {
            alloc->free_blocks[j] = alloc->free_blocks[j+1];
        }
    }

    AT_PRINTF("block %d, offset %zu\n", best_fit_block, offset);

#ifdef GGML_ALLOCATOR_DEBUG
    add_allocated_tensor(alloc, offset, tensor);
    size_t cur_max = offset + size;
    if (cur_max > alloc->max_size) {
        // sort allocated_tensors by offset
        for (int i = 0; i < 1024; i++) {
            for (int j = i + 1; j < 1024; j++) {
                if (alloc->allocated_tensors[i].offset > alloc->allocated_tensors[j].offset) {
                    const struct ggml_tensor * tmp_tensor = alloc->allocated_tensors[i].tensor;
                    size_t tmp_offset = alloc->allocated_tensors[i].offset;
                    alloc->allocated_tensors[i].tensor = alloc->allocated_tensors[j].tensor;
                    alloc->allocated_tensors[i].offset = alloc->allocated_tensors[j].offset;
                    alloc->allocated_tensors[j].tensor = tmp_tensor;
                    alloc->allocated_tensors[j].offset = tmp_offset;
                }
            }
        }
        fprintf(stderr, "max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
        for (int i = 0; i < 1024; i++) {
            if (alloc->allocated_tensors[i].tensor) {
                fprintf(stderr, "%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
                        alloc->allocated_tensors[i].offset,
                        alloc->allocated_tensors[i].offset + ggml_nbytes(alloc->allocated_tensors[i].tensor),
                        ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0);
            }
        }
        fprintf(stderr, "\n");
    }
#endif

    alloc->max_size = MAX(alloc->max_size, offset + size);

    return offset;

    GGML_UNUSED(tensor);
}
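
// For illustration of the best-fit search above: with free blocks {offset 0, size 64},
// {offset 256, size 192} and a large terminal block, a request rounded up to 96 bytes
// skips the terminal block, picks the 192-byte block as the tightest fit, and shrinks
// it to {offset 352, size 96}; the last block is only used when no other block fits.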

// this is a very naive implementation, but for our case the number of free blocks should be very small
static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, size_t size, const struct ggml_tensor * tensor) {
    size = aligned_offset(NULL, size, alloc->alignment);

    AT_PRINTF("%s: freeing %s at %zu (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, offset, size, alloc->n_free_blocks);

#ifdef GGML_ALLOCATOR_DEBUG
    remove_allocated_tensor(alloc, offset, tensor);
#endif

    // see if we can merge with an existing block
    for (int i = 0; i < alloc->n_free_blocks; i++) {
        struct free_block * block = &alloc->free_blocks[i];
        // check if ptr is at the end of the block
        if (block->offset + block->size == offset) {
            block->size += size;
            // check if we can merge with the next block
            if (i < alloc->n_free_blocks - 1 && block->offset + block->size == alloc->free_blocks[i+1].offset) {
                block->size += alloc->free_blocks[i+1].size;
                alloc->n_free_blocks--;
                for (int j = i+1; j < alloc->n_free_blocks; j++) {
                    alloc->free_blocks[j] = alloc->free_blocks[j+1];
                }
            }
            return;
        }
        // check if ptr is at the beginning of the block
        if (offset + size == block->offset) {
            block->offset = offset;
            block->size += size;
            // check if we can merge with the previous block
            if (i > 0 && alloc->free_blocks[i-1].offset + alloc->free_blocks[i-1].size == block->offset) {
                alloc->free_blocks[i-1].size += block->size;
                alloc->n_free_blocks--;
                for (int j = i; j < alloc->n_free_blocks; j++) {
                    alloc->free_blocks[j] = alloc->free_blocks[j+1];
                }
            }
            return;
        }
    }
    // otherwise, add a new block
    GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
    // insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
    int insert_pos = 0;
    while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].offset < offset) {
        insert_pos++;
    }
    // shift all blocks from insert_pos onward to make room for the new block
    for (int i = alloc->n_free_blocks; i > insert_pos; i--) {
        alloc->free_blocks[i] = alloc->free_blocks[i-1];
    }
    // insert the new block
    alloc->free_blocks[insert_pos].offset = offset;
    alloc->free_blocks[insert_pos].size = size;
    alloc->n_free_blocks++;

    GGML_UNUSED(tensor);
}
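
// For illustration of the coalescing above: freeing [64, 128) while the free list holds
// {offset 0, size 64} and {offset 128, size 64} first extends the first block to
// {0, 128}, then notices it now touches the next block and merges the two into a single
// block {0, 192}, keeping the list sorted by address and fragmentation low.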

static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
    alloc->n_free_blocks = 1;
    alloc->free_blocks[0].offset = 0;
    alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
    alloc->max_size = 0;
}

static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment) {
    struct ggml_dyn_tallocr * alloc = (struct ggml_dyn_tallocr *)malloc(sizeof(struct ggml_dyn_tallocr));
    *alloc = (struct ggml_dyn_tallocr) {
        /*.alignment     = */ alignment,
        /*.n_free_blocks = */ 0,
        /*.free_blocks   = */ {{0}},
        /*.max_size      = */ 0,
#ifdef GGML_ALLOCATOR_DEBUG
        /*.allocated_tensors = */ {{0}},
#endif
    };

    ggml_dyn_tallocr_reset(alloc);

    return alloc;
}

static void ggml_dyn_tallocr_free(struct ggml_dyn_tallocr * alloc) {
    free(alloc);
}

static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc) {
    return alloc->max_size;
}


/////////////////////////////////////

// graph allocator

struct hash_node {
    int n_children;
    int n_views;
    int buffer_id;
    size_t offset; // offset within the buffer
    bool allocated;
};

struct tensor_alloc {
    int buffer_id;
    size_t offset;
    size_t size_max; // 0 = pre-allocated, unused, or view
};

struct leaf_alloc {
    int buffer_id;
    struct tensor_alloc leaf;
};

struct node_alloc {
    struct tensor_alloc dst;
    struct tensor_alloc src[GGML_MAX_SRC];
};

struct ggml_gallocr {
    ggml_backend_buffer_type_t * bufts; // [n_buffers]
    ggml_backend_buffer_t * buffers; // [n_buffers]
    struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
    int n_buffers;

    struct ggml_hash_set hash_set;
    struct hash_node * hash_values; // [hash_set.size]

    struct node_alloc * node_allocs; // [n_nodes]
    int n_nodes;

    struct leaf_alloc * leaf_allocs; // [n_leafs]
    int n_leafs;
};
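
// Illustrative usage sketch of the graph allocator, assuming `backend`, `graph` and a
// `worst_case_graph` are built elsewhere (hypothetical names, error handling omitted):
//
//     ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
//     ggml_gallocr_reserve(galloc, worst_case_graph); // optional: size the buffers up front
//     ggml_gallocr_alloc_graph(galloc, graph);        // assign offsets/addresses to all graph tensors
//     // ... compute the graph with the backend ...
//     ggml_gallocr_free(galloc);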

ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
    ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(1, sizeof(struct ggml_gallocr));
    GGML_ASSERT(galloc != NULL);

    galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t));
    GGML_ASSERT(galloc->bufts != NULL);

    galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
    GGML_ASSERT(galloc->buffers != NULL);

    galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
    GGML_ASSERT(galloc->buf_tallocs != NULL);

    for (int i = 0; i < n_bufs; i++) {
        galloc->bufts[i] = bufts[i];
        galloc->buffers[i] = NULL;

        // check if the same buffer type is used multiple times and reuse the same allocator
        for (int j = 0; j < i; j++) {
            if (bufts[i] == bufts[j]) {
                galloc->buf_tallocs[i] = galloc->buf_tallocs[j];
                break;
            }
        }

        if (galloc->buf_tallocs[i] == NULL) {
            size_t alignment = ggml_backend_buft_get_alignment(bufts[i]);
            galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment);
        }
    }
    galloc->n_buffers = n_bufs;

    return galloc;
}

ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft) {
    return ggml_gallocr_new_n(&buft, 1);
}

void ggml_gallocr_free(ggml_gallocr_t galloc) {
    if (galloc == NULL) {
        return;
    }

    for (int i = 0; i < galloc->n_buffers; i++) {
        if (galloc->buffers != NULL) {
            // skip if already freed
            bool freed = false;
            for (int j = 0; j < i; j++) {
                if (galloc->buffers[j] == galloc->buffers[i]) {
                    freed = true;
                    break;
                }
            }
            if (!freed) {
                ggml_backend_buffer_free(galloc->buffers[i]);
            }
        }
        if (galloc->buf_tallocs != NULL) {
            // skip if already freed
            bool freed = false;
            for (int j = 0; j < i; j++) {
                if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) {
                    freed = true;
                    break;
                }
            }
            if (!freed) {
                ggml_dyn_tallocr_free(galloc->buf_tallocs[i]);
            }
        }
    }

    ggml_hash_set_free(&galloc->hash_set);
    free(galloc->hash_values);
    free(galloc->bufts);
    free(galloc->buffers);
    free(galloc->buf_tallocs);
    free(galloc->node_allocs);
    free(galloc->leaf_allocs);
    free(galloc);
}

typedef struct ggml_gallocr * ggml_gallocr_t;

static struct hash_node * ggml_gallocr_hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) {
    size_t i = ggml_hash_find_or_insert(&galloc->hash_set, t);
    return &galloc->hash_values[i];
}

static bool ggml_gallocr_is_own(ggml_gallocr_t galloc, struct ggml_tensor * t) {
    return ggml_gallocr_hash_get(galloc, t)->allocated;
}

static void ggml_gallocr_set_node_offset(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id, size_t offset) {
    struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
    hn->buffer_id = buffer_id;
    hn->offset = offset;
    hn->allocated = true;
}

static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor * t) {
    return t->data != NULL || ggml_gallocr_hash_get(galloc, t)->allocated;
}

static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
    struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);

    if (!ggml_gallocr_is_allocated(galloc, node) && !ggml_is_view(node)) {
        hn->allocated = true;
        assert(hn->offset == 0);

        // try to reuse a parent's buffer (inplace)
        if (ggml_op_can_inplace(node->op)) {
            for (int i = 0; i < GGML_MAX_SRC; i++) {
                struct ggml_tensor * parent = node->src[i];
                if (parent == NULL) {
                    continue;
                }

                // if the node's data is external, then we cannot re-use it
                if (!ggml_gallocr_is_own(galloc, parent)) {
                    AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
                    continue;
                }

                // outputs cannot be reused
                if (parent->flags & GGML_TENSOR_FLAG_OUTPUT || (parent->view_src != NULL && parent->view_src->flags & GGML_TENSOR_FLAG_OUTPUT)) {
                    AT_PRINTF("not reusing parent %s for %s as it is an output\n", parent->name, node->name);
                    continue;
                }

                if (!ggml_are_same_layout(node, parent)) {
                    AT_PRINTF("not reusing parent %s for %s as layouts are different\n", parent->name, node->name);
                    continue;
                }

                struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
                if (p_hn->n_children == 1 && p_hn->n_views == 0) {
                    if (ggml_is_view(parent)) {
                        struct ggml_tensor * view_src = parent->view_src;
                        struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src);
                        if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
                            AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
                            assert(view_src_hn->offset == p_hn->offset);
                            hn->buffer_id = p_hn->buffer_id;
                            hn->offset = p_hn->offset;
                            p_hn->allocated = false; // avoid freeing the parent
                            view_src_hn->allocated = false;
                            return;
                        }
                    } else {
                        AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
                        hn->buffer_id = p_hn->buffer_id;
                        hn->offset = p_hn->offset;
                        p_hn->allocated = false; // avoid freeing the parent
                        return;
                    }
                }
            }
        }

        // allocate tensor from the buffer
        struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
        ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
        size_t size = ggml_backend_buft_get_alloc_size(buft, node);
        size_t offset = ggml_dyn_tallocr_alloc(alloc, size, node);
        hn->buffer_id = buffer_id;
        hn->offset = offset;
        return;
    }
}

static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
    // graph outputs are never freed
    if (node->flags & GGML_TENSOR_FLAG_OUTPUT) {
        AT_PRINTF("not freeing output %s\n", node->name);
        return;
    }

    struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
    size_t offset = hn->offset;
    int buffer_id = hn->buffer_id;
    struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
    ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
    size_t size = ggml_backend_buft_get_alloc_size(buft, node);
    ggml_dyn_tallocr_free_tensor(alloc, offset, size, node);
    hn->allocated = false;
}

static int get_node_buffer_id(const int * node_buffer_ids, int i) {
    return node_buffer_ids ? node_buffer_ids[i] : 0;
}
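
// ggml_gallocr_alloc_graph_impl plans offsets as follows: it first allocates the graph
// leafs, then in one pass over the nodes counts children/views and allocates explicit
// graph inputs, and in a second pass allocates each node in execution order (reusing a
// parent's memory in place when ggml_op_can_inplace allows it) and frees parents whose
// reference counts drop to zero so that later nodes can reuse their space.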
static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
    // clear hash tables
    ggml_hash_set_reset(&galloc->hash_set);
    memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size);

    // allocate leafs
    // these may be tensors that the application is not using in the graph, but may still want to allocate for other purposes
    for (int i = 0; i < graph->n_leafs; i++) {
        struct ggml_tensor * leaf = graph->leafs[i];
        ggml_gallocr_allocate_node(galloc, leaf, get_node_buffer_id(leaf_buffer_ids, i));
    }

    // count number of children and views
    // allocate other graph inputs and leafs first to avoid overwriting them
    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];

        // TODO: better way to add external dependencies
        // GGML_OP_NONE does not appear normally in the graph nodes, but is used by ggml-backend to add dependencies to
        // control when some tensors are allocated and freed. in this case, the dependencies are in `src`, but the node
        // itself is never used and should not be considered a dependency
        if (ggml_is_view(node) && node->op != GGML_OP_NONE) {
            struct ggml_tensor * view_src = node->view_src;
            ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
        }

        if (node->flags & GGML_TENSOR_FLAG_INPUT) {
            ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i));
        }

        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * src = node->src[j];
            if (src == NULL) {
                continue;
            }

            ggml_gallocr_hash_get(galloc, src)->n_children += 1;

            // allocate explicit inputs
            if (src->flags & GGML_TENSOR_FLAG_INPUT) {
                ggml_gallocr_allocate_node(galloc, src, get_node_buffer_id(node_buffer_ids, i));
            }
        }
    }

    // allocate tensors
    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];
        int buffer_id = get_node_buffer_id(node_buffer_ids, i);

        // allocate parents (only leafs need to be allocated at this point)
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * parent = node->src[j];
            if (parent == NULL) {
                continue;
            }
            ggml_gallocr_allocate_node(galloc, parent, buffer_id);
        }

        // allocate node
        ggml_gallocr_allocate_node(galloc, node, buffer_id);

        AT_PRINTF("exec: %s (%s) <= ", ggml_op_desc(node), node->name);
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * parent = node->src[j];
            if (parent == NULL) {
                continue;
            }
            AT_PRINTF("%s", parent->name);
            if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
                AT_PRINTF(", ");
            }
        }
        AT_PRINTF("\n");

        // update parents
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * parent = node->src[j];
            if (parent == NULL) {
                continue;
            }
            struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
            p_hn->n_children -= 1;

            AT_PRINTF("parent %s: %d children, %d views, allocated: %d\n",
                parent->name, p_hn->n_children, p_hn->n_views, p_hn->allocated);

            if (p_hn->n_children == 0 && p_hn->n_views == 0) {
                if (ggml_is_view(parent)) {
                    struct ggml_tensor * view_src = parent->view_src;
                    struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src);
                    view_src_hn->n_views -= 1;
                    AT_PRINTF("view_src %s: %d children, %d views\n",
                        view_src->name, view_src_hn->n_children, view_src_hn->n_views);
                    if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src_hn->allocated) {
                        ggml_gallocr_free_node(galloc, view_src);
                    }
                } else if (p_hn->allocated) {
                    ggml_gallocr_free_node(galloc, parent);
                }
            }
            AT_PRINTF("\n");
        }
    }
}

bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
    size_t min_hash_size = graph->n_nodes + graph->n_leafs;
    // add 25% margin to avoid hash collisions
    min_hash_size += min_hash_size / 4;

    // initialize hash table
    if (galloc->hash_set.size < min_hash_size) {
        ggml_hash_set_free(&galloc->hash_set);
        galloc->hash_set = ggml_hash_set_new(min_hash_size);
        GGML_ASSERT(galloc->hash_set.keys != NULL);

        free(galloc->hash_values);
        galloc->hash_values = malloc(sizeof(struct hash_node) * galloc->hash_set.size);
        GGML_ASSERT(galloc->hash_values != NULL);
    }

    // reset allocators
    for (int i = 0; i < galloc->n_buffers; i++) {
        ggml_dyn_tallocr_reset(galloc->buf_tallocs[i]);
    }

    // allocate in hash table
    ggml_gallocr_alloc_graph_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids);

    // set the node_allocs from the hash table
    if (galloc->n_nodes < graph->n_nodes) {
        free(galloc->node_allocs);
        galloc->node_allocs = calloc(graph->n_nodes, sizeof(struct node_alloc));
        GGML_ASSERT(galloc->node_allocs != NULL);
    }
    galloc->n_nodes = graph->n_nodes;
    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];
        struct node_alloc * node_alloc = &galloc->node_allocs[i];
        if (node->view_src || node->data) {
            node_alloc->dst.buffer_id = -1;
            node_alloc->dst.offset    = SIZE_MAX;
            node_alloc->dst.size_max  = 0;
        } else {
            struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
            node_alloc->dst.buffer_id = hn->buffer_id;
            node_alloc->dst.offset    = hn->offset;
            node_alloc->dst.size_max  = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
        }
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * src = node->src[j];
            if (!src || src->view_src || src->data) {
                node_alloc->src[j].buffer_id = -1;
                node_alloc->src[j].offset    = SIZE_MAX;
                node_alloc->src[j].size_max  = 0;
            } else {
                struct hash_node * hn = ggml_gallocr_hash_get(galloc, src);
                node_alloc->src[j].buffer_id = hn->buffer_id;
                node_alloc->src[j].offset    = hn->offset;
                node_alloc->src[j].size_max  = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src);
            }
        }
    }
    if (galloc->n_leafs < graph->n_leafs) {
        free(galloc->leaf_allocs);
        galloc->leaf_allocs = calloc(graph->n_leafs, sizeof(galloc->leaf_allocs[0]));
        GGML_ASSERT(galloc->leaf_allocs != NULL);
    }
    galloc->n_leafs = graph->n_leafs;
    for (int i = 0; i < graph->n_leafs; i++) {
        struct ggml_tensor * leaf = graph->leafs[i];
        struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
        galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
        if (leaf->view_src || leaf->data) {
            galloc->leaf_allocs[i].leaf.buffer_id = -1;
            galloc->leaf_allocs[i].leaf.offset    = SIZE_MAX;
            galloc->leaf_allocs[i].leaf.size_max  = 0;
        } else {
            galloc->leaf_allocs[i].leaf.buffer_id = hn->buffer_id;
            galloc->leaf_allocs[i].leaf.offset    = hn->offset;
            galloc->leaf_allocs[i].leaf.size_max  = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
        }
    }

    // reallocate buffers if needed
    for (int i = 0; i < galloc->n_buffers; i++) {
        // if the buffer type is used multiple times, we reuse the same buffer
        for (int j = 0; j < i; j++) {
            if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) {
                galloc->buffers[i] = galloc->buffers[j];
                break;
            }
        }

        size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0;
        size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);

        // even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
        if (new_size > cur_size || galloc->buffers[i] == NULL) {
#ifndef NDEBUG
            fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
#endif

            ggml_backend_buffer_free(galloc->buffers[i]);
            galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
            if (galloc->buffers[i] == NULL) {
                fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
                return false;
            }
            ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
        }
    }

    return true;
}

bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
    return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
}

static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, struct tensor_alloc * tensor_alloc) {
    int buffer_id = tensor_alloc->buffer_id;
    assert(tensor->data || tensor->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);

    if (tensor->view_src != NULL) {
        if (tensor->buffer == NULL) {
            assert(tensor_alloc->offset == SIZE_MAX);
            if (tensor->view_src->buffer == NULL) {
                // this tensor was allocated without ggml-backend
                return;
            }
            ggml_backend_view_init(tensor);
        }
    } else {
        if (tensor->data == NULL) {
            assert(tensor_alloc->offset != SIZE_MAX);
            assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
            void * base = ggml_backend_buffer_get_base(galloc->buffers[buffer_id]);
            void * addr = (char *)base + tensor_alloc->offset;
            ggml_backend_tensor_alloc(galloc->buffers[buffer_id], tensor, addr);
        } else {
            if (tensor->buffer == NULL) {
                // this tensor was allocated without ggml-backend
                return;
            }
        }
    }
}

static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
    size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
    return talloc->size_max >= node_size;
}

static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
    if (galloc->n_nodes != graph->n_nodes) {
#ifndef NDEBUG
        fprintf(stderr, "%s: graph has different number of nodes\n", __func__);
#endif
        return true;
    }

    if (galloc->n_leafs != graph->n_leafs) {
#ifndef NDEBUG
        fprintf(stderr, "%s: graph has different number of leafs\n", __func__);
#endif
        return true;
    }

    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];
        struct node_alloc * node_alloc = &galloc->node_allocs[i];

        if (!ggml_gallocr_node_needs_realloc(galloc, node, &node_alloc->dst)) {
#ifndef NDEBUG
            fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name);
#endif
            return true;
        }

        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * src = node->src[j];
            if (src == NULL) {
                continue;
            }
            if (!ggml_gallocr_node_needs_realloc(galloc, src, &node_alloc->src[j])) {
#ifndef NDEBUG
                fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
#endif
                return true;
            }
        }
    }

    return false;
}

bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
    if (ggml_gallocr_needs_realloc(galloc, graph)) {
        if (galloc->n_buffers == 1) {
#ifndef NDEBUG
            fprintf(stderr, "%s: reallocating buffers automatically\n", __func__);
#endif
            if (!ggml_gallocr_reserve(galloc, graph)) {
                return false;
            }
        } else {
#ifndef NDEBUG
            fprintf(stderr, "%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
#endif
            return false;
        }
    }

    // reset buffers
    for (int i = 0; i < galloc->n_buffers; i++) {
        if (galloc->buffers[i] != NULL) {
            ggml_backend_buffer_reset(galloc->buffers[i]);
        }
    }

    // allocate the graph tensors from the previous assignments
    // leafs
    for (int i = 0; i < graph->n_leafs; i++) {
        struct ggml_tensor * leaf = graph->leafs[i];
        struct leaf_alloc * leaf_alloc = &galloc->leaf_allocs[i];
        ggml_gallocr_init_tensor(galloc, leaf, &leaf_alloc->leaf);
    }
    // nodes
    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];
        struct node_alloc * node_alloc = &galloc->node_allocs[i];
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * src = node->src[j];
            if (src == NULL) {
                continue;
            }
            ggml_gallocr_init_tensor(galloc, src, &node_alloc->src[j]);
        }
        ggml_gallocr_init_tensor(galloc, node, &node_alloc->dst);
    }

    return true;
}

size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
    GGML_ASSERT(buffer_id >= 0 && buffer_id < galloc->n_buffers);

    if (galloc->buffers[buffer_id] == NULL) {
        return 0;
    }

    for (int i = 0; i < buffer_id; i++) {
        if (galloc->buffers[i] == galloc->buffers[buffer_id]) {
            // this buffer is the same as a previous one due to the same buffer type being used multiple times
            // only return the buffer size the first time it appears to avoid double counting
            return 0;
        }
    }

    return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
}

// utils

static bool alloc_tensor_range(struct ggml_context * ctx,
        struct ggml_tensor * first, struct ggml_tensor * last,
        ggml_backend_buffer_type_t buft, size_t size,
        ggml_backend_buffer_t ** buffers, size_t * n_buffers) {
    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
    if (buffer == NULL) {
#ifndef NDEBUG
        fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
#endif
        for (size_t i = 0; i < *n_buffers; i++) {
            ggml_backend_buffer_free((*buffers)[i]);
        }
        free(*buffers);
        return false;
    }

    struct ggml_tallocr tallocr = ggml_tallocr_new(buffer);

    for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
        if (t->data == NULL) {
            if (t->view_src == NULL) {
                ggml_tallocr_alloc(&tallocr, t);
            } else if (t->buffer == NULL) {
                ggml_backend_view_init(t);
            }
        } else {
            if (t->view_src != NULL && t->buffer == NULL) {
                // view of a pre-allocated tensor
                ggml_backend_view_init(t);
            }
        }
    }

    *buffers = realloc(*buffers, sizeof(ggml_backend_buffer_t) * (*n_buffers + 1));
    (*buffers)[(*n_buffers)++] = buffer;

    return true;
}

ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
    GGML_ASSERT(ggml_get_no_alloc(ctx) == true);

    size_t alignment = ggml_backend_buft_get_alignment(buft);
    size_t max_size = ggml_backend_buft_get_max_size(buft);

    ggml_backend_buffer_t * buffers = NULL;
    size_t n_buffers = 0;

    size_t cur_buf_size = 0;
    struct ggml_tensor * first = ggml_get_first_tensor(ctx);
    for (struct ggml_tensor * t = first; t != NULL; t = ggml_get_next_tensor(ctx, t)) {
        size_t this_size = 0;
        if (t->data == NULL && t->view_src == NULL) {
            this_size = GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
        }

        if (this_size > max_size) {
            fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
                    __func__, t->name,
                    ggml_backend_buft_name(buft),
                    this_size, max_size);
            for (size_t i = 0; i < n_buffers; i++) {
                ggml_backend_buffer_free(buffers[i]);
            }
            free(buffers);
            return NULL;
        }

        if ((cur_buf_size + this_size) > max_size) {
            // allocate tensors in the current buffer
            if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
                return NULL;
            }
            first = t;
            cur_buf_size = this_size;
        } else {
            cur_buf_size += this_size;
        }
    }

    // allocate remaining tensors
    if (cur_buf_size > 0) {
        if (!alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
            return NULL;
        }
    }

    if (n_buffers == 0) {
#ifndef NDEBUG
        fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
#endif
        return NULL;
    }

    ggml_backend_buffer_t buffer;
    if (n_buffers == 1) {
        buffer = buffers[0];
    } else {
        buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers);
    }
    free(buffers);
    return buffer;
}

ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend) {
    return ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_get_default_buffer_type(backend));
}
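
// Illustrative usage sketch of ggml_backend_alloc_ctx_tensors, assuming the usual ggml.h
// API and a `backend` initialized elsewhere (hypothetical names, error handling omitted):
//
//     struct ggml_init_params params = {
//         /*.mem_size   =*/ 2*ggml_tensor_overhead(), // metadata only
//         /*.mem_buffer =*/ NULL,
//         /*.no_alloc   =*/ true, // tensor data is allocated by ggml-backend, not by the context
//     };
//     struct ggml_context * ctx = ggml_init(params);
//     struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);
//     struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);
//     ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);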