ggml-alloc.c

#include "ggml-alloc.h"
#include "ggml-backend-impl.h"
#include "ggml.h"
#include "ggml-impl.h"
#include <assert.h>
#include <limits.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MAX_FREE_BLOCKS 256

//#define GGML_ALLOCATOR_DEBUG

//#define AT_PRINTF(...) GGML_LOG_DEBUG(__VA_ARGS__)
#define AT_PRINTF(...)
static bool ggml_is_view(const struct ggml_tensor * t) {
    return t->view_src != NULL;
}
// ops that return true for this function must not use restrict pointers for their backend implementations
static bool ggml_op_can_inplace(enum ggml_op op) {
    switch (op) {
        case GGML_OP_SCALE:
        case GGML_OP_DIAG_MASK_ZERO:
        case GGML_OP_DIAG_MASK_INF:
        case GGML_OP_ADD:
        case GGML_OP_ADD_ID:
        case GGML_OP_ADD1:
        case GGML_OP_SUB:
        case GGML_OP_MUL:
        case GGML_OP_DIV:
        case GGML_OP_SQR:
        case GGML_OP_SQRT:
        case GGML_OP_LOG:
        case GGML_OP_UNARY:
        case GGML_OP_ROPE:
        case GGML_OP_ROPE_BACK:
        case GGML_OP_SILU_BACK:
        case GGML_OP_RMS_NORM:
        case GGML_OP_RMS_NORM_BACK:
        case GGML_OP_SOFT_MAX:
        case GGML_OP_SOFT_MAX_BACK:
            return true;

        default:
            return false;
    }
}
static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
    assert(alignment && !(alignment & (alignment - 1))); // power of 2
    size_t align = (alignment - (((uintptr_t)buffer + offset) % alignment)) % alignment;
    return offset + align;
}

// tallocr
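// ggml_tallocr is a simple linear (bump) allocator over a pre-allocated backend buffer:
// each allocation advances an aligned offset and individual tensors are never freed.
// Minimal usage sketch (assumes an existing ggml_backend_buffer_t `buffer` and a tensor `t`
// from a no_alloc context, so t->data is still NULL):
//   struct ggml_tallocr talloc = ggml_tallocr_new(buffer);
//   ggml_tallocr_alloc(&talloc, t);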
struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer) {
    void * base = ggml_backend_buffer_get_base(buffer);
    size_t align = ggml_backend_buffer_get_alignment(buffer);

    assert(align && !(align & (align - 1))); // power of 2

    struct ggml_tallocr talloc = (struct ggml_tallocr) {
        /*.buffer    = */ buffer,
        /*.base      = */ base,
        /*.alignment = */ align,
        /*.offset    = */ aligned_offset(base, 0, align),
    };
    return talloc;
}
enum ggml_status ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor) {
    size_t size = ggml_backend_buffer_get_alloc_size(talloc->buffer, tensor);
    size = GGML_PAD(size, talloc->alignment);

    if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
        GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
                __func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
        GGML_ABORT("not enough space in the buffer");
    }

    void * addr = (char *)ggml_backend_buffer_get_base(talloc->buffer) + talloc->offset;
    talloc->offset += size;

    assert(((uintptr_t)addr % talloc->alignment) == 0);

    return ggml_backend_tensor_alloc(talloc->buffer, tensor, addr);
}

// dynamic tensor allocator
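// The dynamic allocator manages a single virtual address range with a small,
// address-sorted free list and a best-fit search. It only computes offsets and
// tracks the peak usage (max_size); no real memory is touched until the graph
// allocator later allocates a backend buffer of that size.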
struct free_block {
    size_t offset;
    size_t size;
};

struct ggml_dyn_tallocr {
    size_t alignment;
    int n_free_blocks;
    struct free_block free_blocks[MAX_FREE_BLOCKS];
    size_t max_size;

#ifdef GGML_ALLOCATOR_DEBUG
    struct {
        const struct ggml_tensor * tensor;
        size_t offset;
    } allocated_tensors[1024];
#endif
};
#ifdef GGML_ALLOCATOR_DEBUG
static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
    for (int i = 0; i < 1024; i++) {
        if (alloc->allocated_tensors[i].tensor == NULL) {
            alloc->allocated_tensors[i].tensor = tensor;
            alloc->allocated_tensors[i].offset = offset;
            return;
        }
    }
    GGML_ABORT("out of allocated_tensors");
}
static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
    for (int i = 0; i < 1024; i++) {
        if (alloc->allocated_tensors[i].offset == offset) {
            alloc->allocated_tensors[i].tensor = NULL;
            return;
        }
    }
    GGML_ABORT("tried to free tensor %s not found\n", tensor->name);
}
#endif
static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t size, const struct ggml_tensor * tensor) {
    size = aligned_offset(NULL, size, alloc->alignment);

    AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);

    size_t max_avail = 0;

    // find the best fitting free block besides the last block
    int best_fit_block = -1;
    size_t best_fit_size = SIZE_MAX;
    for (int i = 0; i < alloc->n_free_blocks - 1; i++) {
        struct free_block * block = &alloc->free_blocks[i];
        max_avail = MAX(max_avail, block->size);
        if (block->size >= size && block->size <= best_fit_size) {
            best_fit_block = i;
            best_fit_size = block->size;
        }
    }

    if (best_fit_block == -1) {
        // the last block is our last resort
        struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
        max_avail = MAX(max_avail, block->size);
        if (block->size >= size) {
            best_fit_block = alloc->n_free_blocks - 1;
        } else {
            // this should never happen
            GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
                    __func__, size, max_avail);
            GGML_ABORT("not enough space in the buffer");
        }
    }

    struct free_block * block = &alloc->free_blocks[best_fit_block];
    size_t offset = block->offset;
    block->offset = offset + size;
    block->size -= size;
    if (block->size == 0) {
        // remove block if empty
        alloc->n_free_blocks--;
        for (int j = best_fit_block; j < alloc->n_free_blocks; j++) {
            alloc->free_blocks[j] = alloc->free_blocks[j+1];
        }
    }

    AT_PRINTF("block %d, offset %zu\n", best_fit_block, offset);

#ifdef GGML_ALLOCATOR_DEBUG
    add_allocated_tensor(alloc, offset, tensor);
    size_t cur_max = offset + size;
    if (cur_max > alloc->max_size) {
        // sort allocated_tensors by offset
        for (int i = 0; i < 1024; i++) {
            for (int j = i + 1; j < 1024; j++) {
                if (alloc->allocated_tensors[i].offset > alloc->allocated_tensors[j].offset) {
                    const struct ggml_tensor * tmp_tensor = alloc->allocated_tensors[i].tensor;
                    size_t tmp_offset = alloc->allocated_tensors[i].offset;
                    alloc->allocated_tensors[i].tensor = alloc->allocated_tensors[j].tensor;
                    alloc->allocated_tensors[i].offset = alloc->allocated_tensors[j].offset;
                    alloc->allocated_tensors[j].tensor = tmp_tensor;
                    alloc->allocated_tensors[j].offset = tmp_offset;
                }
            }
        }
        GGML_LOG_DEBUG("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
        for (int i = 0; i < 1024; i++) {
            if (alloc->allocated_tensors[i].tensor) {
                GGML_LOG_DEBUG("%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
                    alloc->allocated_tensors[i].offset,
                    alloc->allocated_tensors[i].offset + ggml_nbytes(alloc->allocated_tensors[i].tensor),
                    ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0);
            }
        }
        GGML_LOG_DEBUG("\n");
    }
#endif

    alloc->max_size = MAX(alloc->max_size, offset + size);

    return offset;

    GGML_UNUSED(tensor);
}
// this is a very naive implementation, but for our case the number of free blocks should be very small
static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, size_t size, const struct ggml_tensor * tensor) {
    size = aligned_offset(NULL, size, alloc->alignment);

    AT_PRINTF("%s: freeing %s at %zu (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, offset, size, alloc->n_free_blocks);

#ifdef GGML_ALLOCATOR_DEBUG
    remove_allocated_tensor(alloc, offset, tensor);
#endif

    // see if we can merge with an existing block
    for (int i = 0; i < alloc->n_free_blocks; i++) {
        struct free_block * block = &alloc->free_blocks[i];
        // check if ptr is at the end of the block
        if (block->offset + block->size == offset) {
            block->size += size;
            // check if we can merge with the next block
            if (i < alloc->n_free_blocks - 1 && block->offset + block->size == alloc->free_blocks[i+1].offset) {
                block->size += alloc->free_blocks[i+1].size;
                alloc->n_free_blocks--;
                for (int j = i+1; j < alloc->n_free_blocks; j++) {
                    alloc->free_blocks[j] = alloc->free_blocks[j+1];
                }
            }
            return;
        }
        // check if ptr is at the beginning of the block
        if (offset + size == block->offset) {
            block->offset = offset;
            block->size += size;
            // check if we can merge with the previous block
            if (i > 0 && alloc->free_blocks[i-1].offset + alloc->free_blocks[i-1].size == block->offset) {
                alloc->free_blocks[i-1].size += block->size;
                alloc->n_free_blocks--;
                for (int j = i; j < alloc->n_free_blocks; j++) {
                    alloc->free_blocks[j] = alloc->free_blocks[j+1];
                }
            }
            return;
        }
    }
    // otherwise, add a new block
    GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
    // insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
    int insert_pos = 0;
    while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].offset < offset) {
        insert_pos++;
    }
    // shift all blocks from insert_pos onward to make room for the new block
    for (int i = alloc->n_free_blocks; i > insert_pos; i--) {
        alloc->free_blocks[i] = alloc->free_blocks[i-1];
    }
    // insert the new block
    alloc->free_blocks[insert_pos].offset = offset;
    alloc->free_blocks[insert_pos].size = size;
    alloc->n_free_blocks++;

    GGML_UNUSED(tensor);
}
static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
    alloc->n_free_blocks = 1;
    alloc->free_blocks[0].offset = 0;
    alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
    alloc->max_size = 0;

#ifdef GGML_ALLOCATOR_DEBUG
    for (int i = 0; i < 1024; i++) {
        alloc->allocated_tensors[i].tensor = NULL;
    }
#endif
}

static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment) {
    struct ggml_dyn_tallocr * alloc = (struct ggml_dyn_tallocr *)malloc(sizeof(struct ggml_dyn_tallocr));

    *alloc = (struct ggml_dyn_tallocr) {
        /*.alignment     = */ alignment,
        /*.n_free_blocks = */ 0,
        /*.free_blocks   = */ {{0}},
        /*.max_size      = */ 0,
#ifdef GGML_ALLOCATOR_DEBUG
        /*.allocated_tensors = */ {{0}},
#endif
    };

    ggml_dyn_tallocr_reset(alloc);

    return alloc;
}

static void ggml_dyn_tallocr_free(struct ggml_dyn_tallocr * alloc) {
    free(alloc);
}

static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc) {
    return alloc->max_size;
}


/////////////////////////////////////

// graph allocator
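// ggml_gallocr plans tensor placement for a whole compute graph: it counts how many
// consumers each tensor has, reuses a parent's storage in place where the op allows it,
// and releases a tensor's range as soon as its last consumer has run. The plan is
// computed with the virtual ggml_dyn_tallocr above and then backed by real backend buffers.
//
// Typical usage sketch (assumes a backend and graph created elsewhere):
//   ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
//   ggml_gallocr_reserve(galloc, graph);     // size and allocate the backing buffers
//   ggml_gallocr_alloc_graph(galloc, graph); // assign tensor addresses before each evaluation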
struct hash_node {
    int n_children;
    int n_views;
    int buffer_id;
    size_t offset; // offset within the buffer
    bool allocated;
};

struct tensor_alloc {
    int buffer_id;
    size_t offset;
    size_t size_max; // 0 = pre-allocated, unused, or view
};

struct leaf_alloc {
    struct tensor_alloc leaf;
};

struct node_alloc {
    struct tensor_alloc dst;
    struct tensor_alloc src[GGML_MAX_SRC];
};

struct ggml_gallocr {
    ggml_backend_buffer_type_t * bufts; // [n_buffers]
    ggml_backend_buffer_t * buffers; // [n_buffers]
    struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
    int n_buffers;

    struct ggml_hash_set hash_set;
    struct hash_node * hash_values; // [hash_set.size]

    struct node_alloc * node_allocs; // [n_nodes]
    int n_nodes;

    struct leaf_alloc * leaf_allocs; // [n_leafs]
    int n_leafs;
};
ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
    ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(1, sizeof(struct ggml_gallocr));
    GGML_ASSERT(galloc != NULL);

    galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t));
    GGML_ASSERT(galloc->bufts != NULL);

    galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
    GGML_ASSERT(galloc->buffers != NULL);

    galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
    GGML_ASSERT(galloc->buf_tallocs != NULL);

    for (int i = 0; i < n_bufs; i++) {
        galloc->bufts[i] = bufts[i];
        galloc->buffers[i] = NULL;

        // check if the same buffer type is used multiple times and reuse the same allocator
        for (int j = 0; j < i; j++) {
            if (bufts[i] == bufts[j]) {
                galloc->buf_tallocs[i] = galloc->buf_tallocs[j];
                break;
            }
        }

        if (galloc->buf_tallocs[i] == NULL) {
            size_t alignment = ggml_backend_buft_get_alignment(bufts[i]);
            galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment);
        }
    }
    galloc->n_buffers = n_bufs;

    return galloc;
}

ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft) {
    return ggml_gallocr_new_n(&buft, 1);
}
void ggml_gallocr_free(ggml_gallocr_t galloc) {
    if (galloc == NULL) {
        return;
    }

    for (int i = 0; i < galloc->n_buffers; i++) {
        if (galloc->buffers != NULL) {
            // skip if already freed
            bool freed = false;
            for (int j = 0; j < i; j++) {
                if (galloc->buffers[j] == galloc->buffers[i]) {
                    freed = true;
                    break;
                }
            }
            if (!freed) {
                ggml_backend_buffer_free(galloc->buffers[i]);
            }
        }
        if (galloc->buf_tallocs != NULL) {
            // skip if already freed
            bool freed = false;
            for (int j = 0; j < i; j++) {
                if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) {
                    freed = true;
                    break;
                }
            }
            if (!freed) {
                ggml_dyn_tallocr_free(galloc->buf_tallocs[i]);
            }
        }
    }

    ggml_hash_set_free(&galloc->hash_set);
    free(galloc->hash_values);
    free(galloc->bufts);
    free(galloc->buffers);
    free(galloc->buf_tallocs);
    free(galloc->node_allocs);
    free(galloc->leaf_allocs);
    free(galloc);
}
typedef struct ggml_gallocr * ggml_gallocr_t;

static struct hash_node * ggml_gallocr_hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) {
    size_t i = ggml_hash_find_or_insert(&galloc->hash_set, t);
    return &galloc->hash_values[i];
}

static bool ggml_gallocr_is_own(ggml_gallocr_t galloc, struct ggml_tensor * t) {
    return ggml_gallocr_hash_get(galloc, t)->allocated;
}

static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor * t) {
    return t->data != NULL || ggml_gallocr_hash_get(galloc, t)->allocated;
}
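// Assign a buffer offset to `node`. When the op allows in-place computation, the
// storage of a parent is reused instead, provided the parent is owned by this
// allocator, is not a graph output, has the same layout, and has no other consumers.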
static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
    GGML_ASSERT(buffer_id >= 0);
    struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);

    if (!ggml_gallocr_is_allocated(galloc, node) && !ggml_is_view(node)) {
        hn->allocated = true;
        assert(hn->offset == 0);

        // try to reuse a parent's buffer (inplace)
        if (ggml_op_can_inplace(node->op)) {
            for (int i = 0; i < GGML_MAX_SRC; i++) {
                struct ggml_tensor * parent = node->src[i];
                if (parent == NULL) {
                    continue;
                }

                // if the node's data is external, then we cannot re-use it
                if (!ggml_gallocr_is_own(galloc, parent)) {
                    AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
                    continue;
                }

                // outputs cannot be reused
                if (parent->flags & GGML_TENSOR_FLAG_OUTPUT || (parent->view_src != NULL && parent->view_src->flags & GGML_TENSOR_FLAG_OUTPUT)) {
                    AT_PRINTF("not reusing parent %s for %s as it is an output\n", parent->name, node->name);
                    continue;
                }

                if (!ggml_are_same_layout(node, parent)) {
                    AT_PRINTF("not reusing parent %s for %s as layouts are different\n", parent->name, node->name);
                    continue;
                }

                struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
                if (p_hn->n_children == 1 && p_hn->n_views == 0) {
                    if (ggml_is_view(parent)) {
                        struct ggml_tensor * view_src = parent->view_src;
                        struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src);
                        if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
                            AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
                            assert(view_src_hn->offset == p_hn->offset);
                            hn->buffer_id = p_hn->buffer_id;
                            hn->offset = p_hn->offset;
                            p_hn->allocated = false; // avoid freeing the parent
                            view_src_hn->allocated = false;
                            return;
                        }
                    } else {
                        AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
                        hn->buffer_id = p_hn->buffer_id;
                        hn->offset = p_hn->offset;
                        p_hn->allocated = false; // avoid freeing the parent
                        return;
                    }
                }
            }
        }

        // allocate tensor from the buffer
        struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
        ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
        size_t size = ggml_backend_buft_get_alloc_size(buft, node);
        size_t offset = ggml_dyn_tallocr_alloc(alloc, size, node);
        hn->buffer_id = buffer_id;
        hn->offset = offset;
    }
}
static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
    // graph outputs are never freed
    if (node->flags & GGML_TENSOR_FLAG_OUTPUT) {
        AT_PRINTF("not freeing output %s\n", node->name);
        return;
    }

    struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
    size_t offset = hn->offset;
    int buffer_id = hn->buffer_id;
    struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
    ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
    size_t size = ggml_backend_buft_get_alloc_size(buft, node);
    ggml_dyn_tallocr_free_tensor(alloc, offset, size, node);
    hn->allocated = false;
}

static int get_node_buffer_id(const int * node_buffer_ids, int i) {
    return node_buffer_ids ? node_buffer_ids[i] : 0;
}
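// Plan the whole graph in the virtual allocators: first allocate leafs and inputs and
// count each tensor's consumers, then walk the nodes in execution order, allocating
// parents and the node itself and releasing tensors whose reference counts drop to zero.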
static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
    // clear hash tables
    ggml_hash_set_reset(&galloc->hash_set);
    memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size);

    // allocate leafs
    // these may be tensors that the application is not using in the graph, but may still want to allocate for other purposes
    for (int i = 0; i < graph->n_leafs; i++) {
        struct ggml_tensor * leaf = graph->leafs[i];
        ggml_gallocr_allocate_node(galloc, leaf, get_node_buffer_id(leaf_buffer_ids, i));
    }

    // count number of children and views
    // allocate other graph inputs and leafs first to avoid overwriting them
    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];

        // TODO: better way to add external dependencies
        // GGML_OP_NONE does not appear normally in the graph nodes, but is used by ggml-backend to add dependencies to
        // control when some tensors are allocated and freed. in this case, the dependencies are in `src`, but the node
        // itself is never used and should not be considered a dependency
        if (ggml_is_view(node) && node->op != GGML_OP_NONE) {
            struct ggml_tensor * view_src = node->view_src;
            ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
        }

        if (node->flags & GGML_TENSOR_FLAG_INPUT) {
            ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i));
        }

        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * src = node->src[j];
            if (src == NULL) {
                continue;
            }

            ggml_gallocr_hash_get(galloc, src)->n_children += 1;

            // allocate explicit inputs
            if (src->flags & GGML_TENSOR_FLAG_INPUT) {
                ggml_gallocr_allocate_node(galloc, src, get_node_buffer_id(node_buffer_ids, i));
            }
        }
    }

    // allocate tensors
    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];
        int buffer_id = get_node_buffer_id(node_buffer_ids, i);

        // allocate parents (only leafs need to be allocated at this point)
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * parent = node->src[j];
            if (parent == NULL) {
                continue;
            }
            ggml_gallocr_allocate_node(galloc, parent, buffer_id);
        }

        // allocate node
        ggml_gallocr_allocate_node(galloc, node, buffer_id);

        AT_PRINTF("exec: %s (%s) <= ", ggml_op_desc(node), node->name);
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * parent = node->src[j];
            if (parent == NULL) {
                continue;
            }
            AT_PRINTF("%s", parent->name);
            if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
                AT_PRINTF(", ");
            }
        }
        AT_PRINTF("\n");

        // update parents
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * parent = node->src[j];
            if (parent == NULL) {
                continue;
            }
            struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
            p_hn->n_children -= 1;

            AT_PRINTF("parent %s: %d children, %d views, allocated: %d\n",
                parent->name, p_hn->n_children, p_hn->n_views, p_hn->allocated);

            if (p_hn->n_children == 0 && p_hn->n_views == 0) {
                if (ggml_is_view(parent)) {
                    struct ggml_tensor * view_src = parent->view_src;
                    struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src);
                    view_src_hn->n_views -= 1;
                    AT_PRINTF("view_src %s: %d children, %d views\n",
                        view_src->name, view_src_hn->n_children, view_src_hn->n_views);
                    if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src_hn->allocated) {
                        ggml_gallocr_free_node(galloc, view_src);
                    }
                }
                else if (p_hn->allocated) {
                    ggml_gallocr_free_node(galloc, parent);
                }
            }
            AT_PRINTF("\n");
        }
    }
}
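// Measure the graph and (re)allocate the backing buffers. The per-tensor offsets computed
// during planning are copied into node_allocs/leaf_allocs so that ggml_gallocr_alloc_graph
// can replay them later without re-planning.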
bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
    size_t min_hash_size = graph->n_nodes + graph->n_leafs;
    // add 25% margin to avoid hash collisions
    min_hash_size += min_hash_size / 4;

    // initialize hash table
    if (galloc->hash_set.size < min_hash_size) {
        ggml_hash_set_free(&galloc->hash_set);
        galloc->hash_set = ggml_hash_set_new(min_hash_size);
        GGML_ASSERT(galloc->hash_set.keys != NULL);

        free(galloc->hash_values);
        galloc->hash_values = malloc(sizeof(struct hash_node) * galloc->hash_set.size);
        GGML_ASSERT(galloc->hash_values != NULL);
    }

    // reset allocators
    for (int i = 0; i < galloc->n_buffers; i++) {
        ggml_dyn_tallocr_reset(galloc->buf_tallocs[i]);
    }

    // allocate in hash table
    ggml_gallocr_alloc_graph_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids);

    // set the node_allocs from the hash table
    if (galloc->n_nodes < graph->n_nodes) {
        free(galloc->node_allocs);
        galloc->node_allocs = calloc(graph->n_nodes, sizeof(struct node_alloc));
        GGML_ASSERT(galloc->node_allocs != NULL);
    }
    galloc->n_nodes = graph->n_nodes;
    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];
        struct node_alloc * node_alloc = &galloc->node_allocs[i];
        if (node->view_src || node->data) {
            node_alloc->dst.buffer_id = -1;
            node_alloc->dst.offset = SIZE_MAX;
            node_alloc->dst.size_max = 0;
        } else {
            struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
            node_alloc->dst.buffer_id = hn->buffer_id;
            node_alloc->dst.offset = hn->offset;
            node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
        }
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * src = node->src[j];
            if (!src || src->view_src || src->data) {
                node_alloc->src[j].buffer_id = -1;
                node_alloc->src[j].offset = SIZE_MAX;
                node_alloc->src[j].size_max = 0;
            } else {
                struct hash_node * hn = ggml_gallocr_hash_get(galloc, src);
                node_alloc->src[j].buffer_id = hn->buffer_id;
                node_alloc->src[j].offset = hn->offset;
                node_alloc->src[j].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src);
            }
        }
    }
    if (galloc->n_leafs < graph->n_leafs) {
        free(galloc->leaf_allocs);
        galloc->leaf_allocs = calloc(graph->n_leafs, sizeof(galloc->leaf_allocs[0]));
        GGML_ASSERT(galloc->leaf_allocs != NULL);
    }
    galloc->n_leafs = graph->n_leafs;
    for (int i = 0; i < graph->n_leafs; i++) {
        struct ggml_tensor * leaf = graph->leafs[i];
        struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
        if (leaf->view_src || leaf->data) {
            galloc->leaf_allocs[i].leaf.buffer_id = -1;
            galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
            galloc->leaf_allocs[i].leaf.size_max = 0;
        } else {
            galloc->leaf_allocs[i].leaf.buffer_id = hn->buffer_id;
            galloc->leaf_allocs[i].leaf.offset = hn->offset;
            galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
        }
    }

    // reallocate buffers if needed
    for (int i = 0; i < galloc->n_buffers; i++) {
        // if the buffer type is used multiple times, we reuse the same buffer
        for (int j = 0; j < i; j++) {
            if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) {
                galloc->buffers[i] = galloc->buffers[j];
                break;
            }
        }

        size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0;
        size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);

        // even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
        if (new_size > cur_size || galloc->buffers[i] == NULL) {
#ifndef NDEBUG
            GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
#endif

            ggml_backend_buffer_free(galloc->buffers[i]);
            galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
            if (galloc->buffers[i] == NULL) {
                GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
                return false;
            }
            ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
        }
    }

    return true;
}

bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
    return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
}
static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, struct tensor_alloc * tensor_alloc) {
    int buffer_id = tensor_alloc->buffer_id;
    assert(tensor->data || tensor->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);

    if (tensor->view_src != NULL) {
        if (tensor->buffer == NULL) {
            assert(tensor_alloc->offset == SIZE_MAX);
            if (tensor->view_src->buffer == NULL) {
                // this tensor was allocated without ggml-backend
                return;
            }
            ggml_backend_view_init(tensor);
        }
    } else {
        if (tensor->data == NULL) {
            assert(tensor_alloc->offset != SIZE_MAX);
            assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
            void * base = ggml_backend_buffer_get_base(galloc->buffers[buffer_id]);
            void * addr = (char *)base + tensor_alloc->offset;
            ggml_backend_tensor_alloc(galloc->buffers[buffer_id], tensor, addr);
        } else {
            if (tensor->buffer == NULL) {
                // this tensor was allocated without ggml-backend
                return;
            }
        }
    }
}
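// Note: despite the name, this helper returns true when `node` still fits the allocation
// recorded in `talloc`; ggml_gallocr_needs_realloc below negates the result.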
static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
    size_t node_size = 0;
    if (!node->data && !node->view_src) {
        // If we previously had data but don't now then reallocate
        if (talloc->buffer_id < 0) {
            return false;
        }
        node_size = ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
    }
    return talloc->size_max >= node_size;
}
static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
    if (galloc->n_nodes != graph->n_nodes) {
#ifndef NDEBUG
        GGML_LOG_DEBUG("%s: graph has different number of nodes\n", __func__);
#endif
        return true;
    }

    if (galloc->n_leafs != graph->n_leafs) {
#ifndef NDEBUG
        GGML_LOG_DEBUG("%s: graph has different number of leafs\n", __func__);
#endif
        return true;
    }

    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];
        struct node_alloc * node_alloc = &galloc->node_allocs[i];

        if (!ggml_gallocr_node_needs_realloc(galloc, node, &node_alloc->dst)) {
#ifndef NDEBUG
            GGML_LOG_DEBUG("%s: node %s is not valid\n", __func__, node->name);
#endif
            return true;
        }

        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * src = node->src[j];
            if (src == NULL) {
                continue;
            }
            if (!ggml_gallocr_node_needs_realloc(galloc, src, &node_alloc->src[j])) {
#ifndef NDEBUG
                GGML_LOG_DEBUG("%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
#endif
                return true;
            }
        }
    }

    return false;
}
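// Apply the plan computed by the last reserve: reset the buffers and bind each leaf and
// node (and their sources) to its recorded offset. If the graph no longer fits the plan,
// re-reserve automatically when only a single buffer is used.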
bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
    if (ggml_gallocr_needs_realloc(galloc, graph)) {
        if (galloc->n_buffers == 1) {
#ifndef NDEBUG
            GGML_LOG_DEBUG("%s: reallocating buffers automatically\n", __func__);
#endif
            if (!ggml_gallocr_reserve(galloc, graph)) {
                return false;
            }
        } else {
#ifndef NDEBUG
            GGML_LOG_DEBUG("%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
#endif
            return false;
        }
    }

    // reset buffers
    for (int i = 0; i < galloc->n_buffers; i++) {
        if (galloc->buffers[i] != NULL) {
            ggml_backend_buffer_reset(galloc->buffers[i]);
        }
    }

    // allocate the graph tensors from the previous assignments
    // leafs
    for (int i = 0; i < graph->n_leafs; i++) {
        struct ggml_tensor * leaf = graph->leafs[i];
        struct leaf_alloc * leaf_alloc = &galloc->leaf_allocs[i];
        ggml_gallocr_init_tensor(galloc, leaf, &leaf_alloc->leaf);
    }
    // nodes
    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];
        struct node_alloc * node_alloc = &galloc->node_allocs[i];
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * src = node->src[j];
            if (src == NULL) {
                continue;
            }
            ggml_gallocr_init_tensor(galloc, src, &node_alloc->src[j]);
        }
        ggml_gallocr_init_tensor(galloc, node, &node_alloc->dst);
    }

    return true;
}
size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
    GGML_ASSERT(buffer_id >= 0 && buffer_id < galloc->n_buffers);

    if (galloc->buffers[buffer_id] == NULL) {
        return 0;
    }

    for (int i = 0; i < buffer_id; i++) {
        if (galloc->buffers[i] == galloc->buffers[buffer_id]) {
            // this buffer is the same as a previous one due to the same buffer type being used multiple times
            // only return the buffer size the first time it appears to avoid double counting
            return 0;
        }
    }

    return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
}
// utils

static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) {
    for (size_t i = 0; i < *n_buffers; i++) {
        ggml_backend_buffer_free((*buffers)[i]);
    }
    free(*buffers);
}
static bool alloc_tensor_range(struct ggml_context * ctx,
        struct ggml_tensor * first, struct ggml_tensor * last,
        ggml_backend_buffer_type_t buft, size_t size,
        ggml_backend_buffer_t ** buffers, size_t * n_buffers) {

    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
    if (buffer == NULL) {
        GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
        free_buffers(buffers, n_buffers);
        return false;
    }

    *buffers = realloc(*buffers, sizeof(ggml_backend_buffer_t) * (*n_buffers + 1));
    (*buffers)[(*n_buffers)++] = buffer;

    struct ggml_tallocr tallocr = ggml_tallocr_new(buffer);

    for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
        enum ggml_status status = GGML_STATUS_SUCCESS;
        if (t->data == NULL) {
            if (t->view_src == NULL) {
                status = ggml_tallocr_alloc(&tallocr, t);
            } else if (t->buffer == NULL) {
                status = ggml_backend_view_init(t);
            }
        } else {
            if (t->view_src != NULL && t->buffer == NULL) {
                // view of a pre-allocated tensor
                status = ggml_backend_view_init(t);
            }
        }
        if (status != GGML_STATUS_SUCCESS) {
            GGML_LOG_ERROR("%s: failed to initialize tensor %s\n", __func__, t->name);
            free_buffers(buffers, n_buffers);
            return false;
        }
    }

    return true;
}
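// Allocate every unallocated tensor of a context from the given buffer type, splitting
// the tensors across multiple backend buffers when the total would exceed the buffer
// type's maximum buffer size.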
ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
    GGML_ASSERT(ggml_get_no_alloc(ctx) == true);

    size_t alignment = ggml_backend_buft_get_alignment(buft);
    size_t max_size = ggml_backend_buft_get_max_size(buft);

    ggml_backend_buffer_t * buffers = NULL;
    size_t n_buffers = 0;

    size_t cur_buf_size = 0;
    struct ggml_tensor * first = ggml_get_first_tensor(ctx);
    for (struct ggml_tensor * t = first; t != NULL; t = ggml_get_next_tensor(ctx, t)) {
        size_t this_size = 0;
        if (t->data == NULL && t->view_src == NULL) {
            this_size = GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
        }

        if (cur_buf_size > 0 && (cur_buf_size + this_size) > max_size) {
            // allocate tensors in the current buffer
            if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
                return NULL;
            }
            first = t;
            cur_buf_size = this_size;
        } else {
            cur_buf_size += this_size;
        }
    }

    // allocate remaining tensors
    if (cur_buf_size > 0) {
        if (!alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
            return NULL;
        }
    }

    if (n_buffers == 0) {
#ifndef NDEBUG
        GGML_LOG_DEBUG("%s: all tensors in the context are already allocated\n", __func__);
#endif
        return NULL;
    }

    ggml_backend_buffer_t buffer;
    if (n_buffers == 1) {
        buffer = buffers[0];
    } else {
        buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers);
    }
    free(buffers);
    return buffer;
}

ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend) {
    return ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_get_default_buffer_type(backend));
}