// ggml-alloc.c

#include "ggml-alloc.h"
#include "ggml-backend-impl.h"
#include "ggml.h"
#include "ggml-impl.h"
#include <assert.h>
#include <limits.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MAX_FREE_BLOCKS 256

//#define GGML_ALLOCATOR_DEBUG

//#define AT_PRINTF(...) GGML_LOG_DEBUG(__VA_ARGS__)
#define AT_PRINTF(...)

static bool ggml_is_view(const struct ggml_tensor * t) {
    return t->view_src != NULL;
}

// ops that return true for this function must not use restrict pointers for their backend implementations
bool ggml_op_can_inplace(enum ggml_op op) {
    switch (op) {
        case GGML_OP_FILL:
        case GGML_OP_SCALE:
        case GGML_OP_DIAG_MASK_ZERO:
        case GGML_OP_DIAG_MASK_INF:
        case GGML_OP_ADD:
        case GGML_OP_ADD_ID:
        case GGML_OP_ADD1:
        case GGML_OP_SUB:
        case GGML_OP_MUL:
        case GGML_OP_DIV:
        case GGML_OP_SQR:
        case GGML_OP_SQRT:
        case GGML_OP_LOG:
        case GGML_OP_UNARY:
        case GGML_OP_ROPE:
        case GGML_OP_ROPE_BACK:
        case GGML_OP_SILU_BACK:
        case GGML_OP_RMS_NORM:
        case GGML_OP_RMS_NORM_BACK:
        case GGML_OP_SOFT_MAX:
        case GGML_OP_SOFT_MAX_BACK:
            return true;

        default:
            return false;
    }
}
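
// Added note (not part of the original source): "inplace" here means the graph allocator may assign a node's
// destination to the same buffer address as one of its sources, so e.g. an ADD node may be executed with
// dst->data == src0->data. Backend kernels for the ops listed above therefore must not assume that the
// destination and source buffers do not alias (no restrict-qualified pointers).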
static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
    assert(alignment && !(alignment & (alignment - 1))); // power of 2
    size_t align = (alignment - (((uintptr_t)buffer + offset) % alignment)) % alignment;
    return offset + align;
}
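
// Worked example (illustrative): with buffer == NULL, offset == 10 and alignment == 16,
// align = (16 - (10 % 16)) % 16 = 6, so aligned_offset() returns 16, the next 16-byte boundary.
// An offset that is already aligned (e.g. 32) is returned unchanged.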

// tallocr

struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer) {
    void * base = ggml_backend_buffer_get_base(buffer);
    size_t align = ggml_backend_buffer_get_alignment(buffer);

    assert(align && !(align & (align - 1))); // power of 2

    struct ggml_tallocr talloc = (struct ggml_tallocr) {
        /*.buffer    = */ buffer,
        /*.base      = */ base,
        /*.alignment = */ align,
        /*.offset    = */ aligned_offset(base, 0, align),
    };
    return talloc;
}

enum ggml_status ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor) {
    size_t size = ggml_backend_buffer_get_alloc_size(talloc->buffer, tensor);
    size = GGML_PAD(size, talloc->alignment);

    if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
        GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
                __func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
        GGML_ABORT("not enough space in the buffer");
    }

    void * addr = (char *)ggml_backend_buffer_get_base(talloc->buffer) + talloc->offset;
    talloc->offset += size;

    assert(((uintptr_t)addr % talloc->alignment) == 0);

    return ggml_backend_tensor_alloc(talloc->buffer, tensor, addr);
}
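
// Minimal usage sketch for the linear allocator above (illustrative; `buffer`, `a` and `b` are
// assumed to have been created elsewhere with the ggml / ggml-backend APIs):
//
//     struct ggml_tallocr talloc = ggml_tallocr_new(buffer);
//     ggml_tallocr_alloc(&talloc, a); // a is placed at the current offset
//     ggml_tallocr_alloc(&talloc, b); // b follows a, padded to the buffer alignment
//
// Allocations only move the offset forward; individual tensors cannot be freed with this allocator.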

// dynamic tensor allocator

#define GGML_VBUFFER_MAX_CHUNKS 16

// relative memory address within an allocation that can be split into multiple buffers (chunks)
struct buffer_address {
    int    chunk;  // index of a backend buffer
    size_t offset; // local memory offset within the buffer
};

static const struct buffer_address GGML_BUFFER_ADDRESS_INVALID = { -1, SIZE_MAX };

static bool ggml_buffer_address_less(struct buffer_address a, struct buffer_address b) {
    return a.chunk != b.chunk ? a.chunk < b.chunk : a.offset < b.offset;
}

struct free_block {
    size_t offset;
    size_t size;
};

struct tallocr_chunk {
    struct free_block free_blocks[MAX_FREE_BLOCKS];
    int n_free_blocks;
    size_t max_size;
};

struct ggml_dyn_tallocr {
    size_t alignment;
    size_t max_chunk_size;
    struct tallocr_chunk * chunks[GGML_VBUFFER_MAX_CHUNKS];
    int n_chunks;

#ifdef GGML_ALLOCATOR_DEBUG
    struct {
        const struct ggml_tensor * tensor;
        struct buffer_address addr;
    } allocated_tensors[1024];
#endif
};

static void ggml_dyn_tallocr_insert_block(struct tallocr_chunk * chunk, size_t offset, size_t size) {
    GGML_ASSERT(chunk->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
    // insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
    int insert_pos = 0;
    while (insert_pos < chunk->n_free_blocks && chunk->free_blocks[insert_pos].offset < offset) {
        insert_pos++;
    }
    // shift all blocks from insert_pos onward to make room for the new block
    for (int i = chunk->n_free_blocks; i > insert_pos; i--) {
        chunk->free_blocks[i] = chunk->free_blocks[i-1];
    }
    // insert the new block
    chunk->free_blocks[insert_pos].offset = offset;
    chunk->free_blocks[insert_pos].size = size;
    chunk->n_free_blocks++;
}

static void ggml_dyn_tallocr_remove_block(struct tallocr_chunk * chunk, int idx) {
    // shift all elements after idx by 1 to the left, overwriting the element at idx
    for (int i = idx; i < chunk->n_free_blocks; i++) {
        chunk->free_blocks[i] = chunk->free_blocks[i+1];
    }
    chunk->n_free_blocks--;
}

static int ggml_dyn_tallocr_new_chunk(struct ggml_dyn_tallocr * alloc, size_t min_size) {
    if (alloc->n_chunks >= GGML_VBUFFER_MAX_CHUNKS) {
        return -1;
    }
    struct tallocr_chunk * chunk = calloc(1, sizeof(struct tallocr_chunk));
    chunk->n_free_blocks = 1;
    chunk->free_blocks[0].offset = 0;
    // available space in a chunk is limited to max_chunk_size, but can be higher if:
    // 1. a single tensor exceeds the maximum, and cannot fit any other way
    // 2. we are running out of chunks
    // backends will either manage to allocate the larger size, or report an error.
    chunk->free_blocks[0].size = MAX(min_size, alloc->max_chunk_size);
    if (alloc->n_chunks == GGML_VBUFFER_MAX_CHUNKS - 1) {
        chunk->free_blocks[0].size = SIZE_MAX/2;
    }
    alloc->chunks[alloc->n_chunks] = chunk;
    alloc->n_chunks++;
    return alloc->n_chunks - 1;
}
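
// Illustrative chunk sizing (numbers are hypothetical): with max_chunk_size = 256 MiB, a 64 MiB request
// creates a 256 MiB chunk, while a single 300 MiB tensor creates a 300 MiB chunk (case 1 above). The last
// allowed chunk is created with SIZE_MAX/2 of free space so that the measurement pass never runs out of
// address space (case 2 above).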

#ifdef GGML_ALLOCATOR_DEBUG
static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, const struct ggml_tensor * tensor) {
    for (int i = 0; i < 1024; i++) {
        if (alloc->allocated_tensors[i].tensor == NULL) {
            alloc->allocated_tensors[i].tensor = tensor;
            alloc->allocated_tensors[i].addr = addr;
            return;
        }
    }
    GGML_ABORT("out of allocated_tensors");
}

static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, const struct ggml_tensor * tensor) {
    for (int i = 0; i < 1024; i++) {
        if (alloc->allocated_tensors[i].addr.chunk == addr.chunk && alloc->allocated_tensors[i].addr.offset == addr.offset) {
            alloc->allocated_tensors[i].tensor = NULL;
            return;
        }
    }
    GGML_ABORT("tried to free tensor %s not found\n", tensor->name);
}
#endif

static struct buffer_address ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t size, const struct ggml_tensor * tensor) {
    size = aligned_offset(NULL, size, alloc->alignment);

    AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);

    int best_fit_chunk = -1;
    int best_fit_block = -1;
    size_t max_avail = 0;

    // find the best fitting free block besides the last block, within any chunk
    for (int c = 0; c < alloc->n_chunks; ++c) {
        struct tallocr_chunk * chunk = alloc->chunks[c];
        size_t best_fit_size = SIZE_MAX;
        for (int i = 0; i < chunk->n_free_blocks - 1; i++) {
            struct free_block * block = &chunk->free_blocks[i];
            max_avail = MAX(max_avail, block->size);
            if (block->size >= size && block->size <= best_fit_size) {
                best_fit_chunk = c;
                best_fit_block = i;
                best_fit_size = block->size;
            }
        }
    }

    if (best_fit_block == -1) {
        // no suitable block found, try the last block (this may grow a chunk's size)
        int64_t best_reuse = INT64_MIN;
        for (int c = 0; c < alloc->n_chunks; ++c) {
            struct tallocr_chunk * chunk = alloc->chunks[c];
            if (chunk->n_free_blocks > 0) {
                struct free_block * block = &chunk->free_blocks[chunk->n_free_blocks - 1];
                max_avail = MAX(max_avail, block->size);
                int64_t reuse_factor = chunk->max_size - block->offset - size;
                // reuse_factor < 0 : amount of extra memory that needs to be allocated
                // reuse_factor = 0 : allocated free space exactly matches tensor size
                // reuse_factor > 0 : superfluous memory that will remain unused
                bool better_reuse = best_reuse < 0 && reuse_factor > best_reuse;
                bool better_fit = reuse_factor >= 0 && reuse_factor < best_reuse;
                if (block->size >= size && (better_reuse || better_fit)) {
                    best_fit_chunk = c;
                    best_fit_block = chunk->n_free_blocks - 1;
                    best_reuse = reuse_factor;
                }
            }
        }
    }

    if (best_fit_block == -1) {
        // none of the existing chunks have enough space left
        best_fit_chunk = ggml_dyn_tallocr_new_chunk(alloc, size);
        best_fit_block = 0;
    }
    if (best_fit_chunk == -1) {
        // since the last chunk always has virtually endless memory, this should never happen
        GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
            __func__, size, max_avail);
        GGML_ABORT("graph allocation: failed to reserve memory");
    }

    struct tallocr_chunk * chunk = alloc->chunks[best_fit_chunk];
    struct free_block * block = &chunk->free_blocks[best_fit_block];
    struct buffer_address addr = {.chunk = best_fit_chunk, .offset = block->offset };
    block->offset += size;
    block->size -= size;
    if (block->size == 0) {
        // remove block if empty
        ggml_dyn_tallocr_remove_block(chunk, best_fit_block);
    }

    AT_PRINTF("block %d, offset %zu, chunk %d\n", best_fit_block, addr.offset, addr.chunk);

#ifdef GGML_ALLOCATOR_DEBUG
    add_allocated_tensor(alloc, addr, tensor);
    size_t cur_max = addr.offset + size;
    if (cur_max > chunk->max_size) {
        // sort allocated_tensors by chunk/offset
        for (int i = 0; i < 1024; i++) {
            for (int j = i + 1; j < 1024; j++) {
                if (ggml_buffer_address_less(alloc->allocated_tensors[j].addr, alloc->allocated_tensors[i].addr)) {
                    const struct ggml_tensor * tmp_tensor = alloc->allocated_tensors[i].tensor;
                    struct buffer_address tmp_addr = alloc->allocated_tensors[i].addr;
                    alloc->allocated_tensors[i].tensor = alloc->allocated_tensors[j].tensor;
                    alloc->allocated_tensors[i].addr = alloc->allocated_tensors[j].addr;
                    alloc->allocated_tensors[j].tensor = tmp_tensor;
                    alloc->allocated_tensors[j].addr = tmp_addr;
                }
            }
        }
        GGML_LOG_DEBUG("max_size[%d] = %.2f MB: tensors: ", addr.chunk, cur_max / 1024.0 / 1024.0);
        for (int i = 0; i < 1024; i++) {
            if (alloc->allocated_tensors[i].tensor) {
                GGML_LOG_DEBUG("%s [%d: %zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
                    alloc->allocated_tensors[i].addr.chunk,
                    alloc->allocated_tensors[i].addr.offset,
                    alloc->allocated_tensors[i].addr.offset + ggml_nbytes(alloc->allocated_tensors[i].tensor),
                    ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0);
            }
        }
        GGML_LOG_DEBUG("\n");
    }
#endif

    chunk->max_size = MAX(chunk->max_size, addr.offset + size);

    return addr;

    GGML_UNUSED(tensor);
}
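
// Worked example for reuse_factor (illustrative numbers): if a chunk has already measured max_size = 100 bytes
// and its last free block starts at offset 80, a 30-byte request gives reuse_factor = 100 - 80 - 30 = -10,
// i.e. the chunk would have to grow by 10 bytes; a 15-byte request gives reuse_factor = 5, i.e. 5 bytes of
// already-reserved space would stay unused. The search above prefers the smallest non-negative reuse_factor,
// and while only negative candidates exist, the one that grows its chunk the least.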

// this is a very naive implementation, but for our case the number of free blocks should be very small
static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, size_t size, const struct ggml_tensor * tensor) {
    size = aligned_offset(NULL, size, alloc->alignment);

    AT_PRINTF("%s: freeing %s at {chunk=%d, offset=%zu} (%zu bytes) - n_free_blocks = %d\n",
        __func__, tensor->name, addr.chunk, addr.offset, size, alloc->chunks[addr.chunk]->n_free_blocks);

#ifdef GGML_ALLOCATOR_DEBUG
    remove_allocated_tensor(alloc, addr, tensor);
#endif

    struct tallocr_chunk * chunk = alloc->chunks[addr.chunk];

    // see if we can merge with an existing block
    for (int i = 0; i < chunk->n_free_blocks; i++) {
        struct free_block * block = &chunk->free_blocks[i];
        // check if ptr is at the end of the block
        if (block->offset + block->size == addr.offset) {
            block->size += size;
            // check if we can merge with the next block
            if (i < chunk->n_free_blocks - 1) {
                struct free_block * next = &chunk->free_blocks[i+1];
                if (block->offset + block->size == next->offset) {
                    block->size += next->size;
                    ggml_dyn_tallocr_remove_block(chunk, i+1);
                }
            }
            return;
        }
        // check if ptr is at the beginning of the block
        if (addr.offset + size == block->offset) {
            block->offset = addr.offset;
            block->size += size;
            // check if we can merge with the previous block
            if (i > 0) {
                struct free_block * prev = &chunk->free_blocks[i-1];
                if (prev->offset + prev->size == block->offset) {
                    prev->size += block->size;
                    ggml_dyn_tallocr_remove_block(chunk, i);
                }
            }
            return;
        }
    }
    // otherwise, add a new block
    ggml_dyn_tallocr_insert_block(chunk, addr.offset, size);

    GGML_UNUSED(tensor);
}
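
// Illustrative merge behavior: freeing the range [32, 64) when [0, 32) is already a free block extends that
// block to [0, 64); if [64, 128) is also free, the two blocks are further merged into [0, 128). If the freed
// range is not adjacent to any free block, it is inserted as a new block, kept sorted by offset.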

static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
    for (int i = 0; i < GGML_VBUFFER_MAX_CHUNKS; i++) {
        free(alloc->chunks[i]);
        alloc->chunks[i] = NULL;
    }
    alloc->n_chunks = 0;

#ifdef GGML_ALLOCATOR_DEBUG
    for (int i = 0; i < 1024; i++) {
        alloc->allocated_tensors[i].tensor = NULL;
    }
#endif
}

static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment, size_t max_buffer_size) {
    struct ggml_dyn_tallocr * alloc = (struct ggml_dyn_tallocr *)malloc(sizeof(struct ggml_dyn_tallocr));

    *alloc = (struct ggml_dyn_tallocr) {
        /*.alignment      = */ alignment,
        /*.max_chunk_size = */ MIN(max_buffer_size, SIZE_MAX/2), // clamp to avoid overflows
        /*.chunks         = */ {NULL},
        /*.n_chunks       = */ 0,
#ifdef GGML_ALLOCATOR_DEBUG
        /*.allocated_tensors = */ {{0}},
#endif
    };

    ggml_dyn_tallocr_reset(alloc);

    return alloc;
}

static void ggml_dyn_tallocr_free(struct ggml_dyn_tallocr * alloc) {
    for (int i = 0; i < alloc->n_chunks; ++i) {
        free(alloc->chunks[i]);
    }
    free(alloc);
}

static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc, int chunk) {
    return chunk < alloc->n_chunks ? alloc->chunks[chunk]->max_size : 0;
}

// virtual buffer with contiguous memory range, split into multiple backend buffers (chunks)
struct vbuffer {
    ggml_backend_buffer_t chunks[GGML_VBUFFER_MAX_CHUNKS];
};

static void ggml_vbuffer_free(struct vbuffer * buf) {
    if (buf == NULL) {
        return;
    }
    for (int i = 0; i < GGML_VBUFFER_MAX_CHUNKS; ++i) {
        ggml_backend_buffer_free(buf->chunks[i]);
    }
    free(buf);
}

static size_t ggml_vbuffer_chunk_size(struct vbuffer * buf, int chunk) {
    return buf->chunks[chunk] ? ggml_backend_buffer_get_size(buf->chunks[chunk]) : 0;
}

static size_t ggml_vbuffer_size(struct vbuffer * buf) {
    size_t size = 0;
    for (int i = 0; i < GGML_VBUFFER_MAX_CHUNKS && buf->chunks[i]; ++i) {
        size += ggml_backend_buffer_get_size(buf->chunks[i]);
    }
    return size;
}

static struct vbuffer * ggml_vbuffer_alloc(ggml_backend_buffer_type_t buft, const struct ggml_dyn_tallocr * talloc, enum ggml_backend_buffer_usage usage) {
    struct vbuffer * buf = (struct vbuffer *)calloc(1, sizeof(struct vbuffer));
    if (buf == NULL) {
        return NULL;
    }

    for (int n = 0; n < talloc->n_chunks; n++) {
        size_t chunk_size = talloc->chunks[n]->max_size;
        buf->chunks[n] = ggml_backend_buft_alloc_buffer(buft, chunk_size);
        if (buf->chunks[n] == NULL) {
            ggml_vbuffer_free(buf);
            return NULL;
        }
        ggml_backend_buffer_set_usage(buf->chunks[n], usage);
    }
    return buf;
}

static void ggml_vbuffer_tensor_alloc(struct vbuffer * buf, struct ggml_tensor * tensor, struct buffer_address buf_addr) {
    void * base = ggml_backend_buffer_get_base(buf->chunks[buf_addr.chunk]);
    void * addr = (char *)base + buf_addr.offset;
    ggml_backend_tensor_alloc(buf->chunks[buf_addr.chunk], tensor, addr);
}

static void ggml_vbuffer_reset(struct vbuffer * buf) {
    for (int i = 0; i < GGML_VBUFFER_MAX_CHUNKS && buf->chunks[i]; ++i) {
        ggml_backend_buffer_reset(buf->chunks[i]);
    }
}
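
// Example of address resolution (illustrative): a tensor recorded at buffer_address {chunk = 2, offset = 0x100}
// ends up at ggml_backend_buffer_get_base(buf->chunks[2]) + 0x100 via ggml_vbuffer_tensor_alloc() above.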

/////////////////////////////////////

// graph allocator

struct hash_node {
    int n_children;
    int n_views;
    int buffer_id;
    struct buffer_address addr;
    bool allocated;
};

struct tensor_alloc {
    int buffer_id;
    struct buffer_address addr;
    size_t size_max; // 0 = pre-allocated, unused, or view
};

struct leaf_alloc {
    struct tensor_alloc leaf;
};

struct node_alloc {
    struct tensor_alloc dst;
    struct tensor_alloc src[GGML_MAX_SRC];
};

struct ggml_gallocr {
    ggml_backend_buffer_type_t * bufts; // [n_buffers]
    struct vbuffer ** buffers; // [n_buffers]
    struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
    int n_buffers;

    struct ggml_hash_set hash_set;
    struct hash_node * hash_values; // [hash_set.size]

    struct node_alloc * node_allocs; // [n_nodes]
    int n_nodes;

    struct leaf_alloc * leaf_allocs; // [n_leafs]
    int n_leafs;
};

ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
    ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(1, sizeof(struct ggml_gallocr));
    GGML_ASSERT(galloc != NULL);

    galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t));
    GGML_ASSERT(galloc->bufts != NULL);

    galloc->buffers = calloc(n_bufs, sizeof(struct vbuffer *));
    GGML_ASSERT(galloc->buffers != NULL);

    galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
    GGML_ASSERT(galloc->buf_tallocs != NULL);

    for (int i = 0; i < n_bufs; i++) {
        galloc->bufts[i] = bufts[i];
        galloc->buffers[i] = NULL;

        // check if the same buffer type is used multiple times and reuse the same allocator
        for (int j = 0; j < i; j++) {
            if (bufts[i] == bufts[j]) {
                galloc->buf_tallocs[i] = galloc->buf_tallocs[j];
                break;
            }
        }

        if (galloc->buf_tallocs[i] == NULL) {
            size_t alignment = ggml_backend_buft_get_alignment(bufts[i]);
            size_t max_size = ggml_backend_buft_get_max_size(bufts[i]);
            galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment, max_size);
        }
    }
    galloc->n_buffers = n_bufs;

    return galloc;
}

ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft) {
    return ggml_gallocr_new_n(&buft, 1);
}
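
// Typical usage of the graph allocator (a sketch along the lines of the example documented in ggml-alloc.h;
// `build_graph()`, `max_batch`, `batch` and `backend` are assumed to be provided by the application):
//
//     ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
//
//     // optional: reserve with a worst-case graph to avoid reallocations at eval time
//     ggml_gallocr_reserve(galloc, build_graph(max_batch));
//
//     // allocate and evaluate the actual graph
//     struct ggml_cgraph * graph = build_graph(batch);
//     ggml_gallocr_alloc_graph(galloc, graph);
//     printf("compute buffer size: %zu bytes\n", ggml_gallocr_get_buffer_size(galloc, 0));
//     ggml_backend_graph_compute(backend, graph);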

void ggml_gallocr_free(ggml_gallocr_t galloc) {
    if (galloc == NULL) {
        return;
    }

    for (int i = 0; i < galloc->n_buffers; i++) {
        if (galloc->buffers != NULL) {
            // skip if already freed
            bool freed = false;
            for (int j = 0; j < i; j++) {
                if (galloc->buffers[j] == galloc->buffers[i]) {
                    freed = true;
                    break;
                }
            }
            if (!freed) {
                ggml_vbuffer_free(galloc->buffers[i]);
            }
        }
        if (galloc->buf_tallocs != NULL) {
            // skip if already freed
            bool freed = false;
            for (int j = 0; j < i; j++) {
                if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) {
                    freed = true;
                    break;
                }
            }
            if (!freed) {
                ggml_dyn_tallocr_free(galloc->buf_tallocs[i]);
            }
        }
    }

    ggml_hash_set_free(&galloc->hash_set);
    free(galloc->hash_values);
    free(galloc->bufts);
    free(galloc->buffers);
    free(galloc->buf_tallocs);
    free(galloc->node_allocs);
    free(galloc->leaf_allocs);
    free(galloc);
}

typedef struct ggml_gallocr * ggml_gallocr_t;

static struct hash_node * ggml_gallocr_hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) {
    size_t i = ggml_hash_find_or_insert(&galloc->hash_set, t);
    return &galloc->hash_values[i];
}

static bool ggml_gallocr_is_own(ggml_gallocr_t galloc, struct ggml_tensor * t) {
    return ggml_gallocr_hash_get(galloc, t)->allocated;
}

static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor * t) {
    return t->data != NULL || ggml_gallocr_hash_get(galloc, t)->allocated;
}

// free the extra space at the end if the new tensor is smaller
static void ggml_gallocr_free_extra_space(ggml_gallocr_t galloc, struct ggml_tensor * node, struct ggml_tensor * parent) {
    struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
    struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);

    size_t parent_size = ggml_backend_buft_get_alloc_size(galloc->bufts[p_hn->buffer_id], parent);
    size_t node_size = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);

    GGML_ASSERT(parent_size >= node_size);

    if (parent_size > node_size) {
        struct ggml_dyn_tallocr * p_alloc = galloc->buf_tallocs[p_hn->buffer_id];
        struct buffer_address p_addr = p_hn->addr;
        p_addr.offset += node_size;
        size_t extra_size = parent_size - node_size;
        AT_PRINTF("freeing extra %zu bytes from parent %s for %s\n", extra_size, parent->name, node->name);
        ggml_dyn_tallocr_free_tensor(p_alloc, p_addr, extra_size, parent);
    }
}

static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
    GGML_ASSERT(buffer_id >= 0);
    struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);

    if (!ggml_gallocr_is_allocated(galloc, node) && !ggml_is_view(node)) {
        hn->allocated = true;
        assert(hn->addr.offset == 0);

        // try to reuse a parent's buffer (inplace)
        if (ggml_op_can_inplace(node->op)) {
            for (int i = 0; i < GGML_MAX_SRC; i++) {
                struct ggml_tensor * parent = node->src[i];
                if (parent == NULL) {
                    continue;
                }

                // if the node's data is external, then we cannot re-use it
                if (!ggml_gallocr_is_own(galloc, parent)) {
                    AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
                    continue;
                }

                // outputs cannot be reused
                if (parent->flags & GGML_TENSOR_FLAG_OUTPUT || (parent->view_src != NULL && parent->view_src->flags & GGML_TENSOR_FLAG_OUTPUT)) {
                    AT_PRINTF("not reusing parent %s for %s as it is an output\n", parent->name, node->name);
                    continue;
                }

                if (!ggml_are_same_layout(node, parent)) {
                    AT_PRINTF("not reusing parent %s for %s as layouts are different\n", parent->name, node->name);
                    continue;
                }

                struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
                if (p_hn->n_children == 1 && p_hn->n_views == 0) {
                    if (ggml_is_view(parent)) {
                        struct ggml_tensor * view_src = parent->view_src;
                        struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src);
                        if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
                            AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
                            assert(view_src_hn->addr.chunk == p_hn->addr.chunk && view_src_hn->addr.offset == p_hn->addr.offset);
                            hn->buffer_id = p_hn->buffer_id;
                            hn->addr = p_hn->addr;
                            p_hn->allocated = false; // avoid freeing the parent
                            view_src_hn->allocated = false;
                            ggml_gallocr_free_extra_space(galloc, node, view_src);
                            return;
                        }
                    } else {
                        AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
                        hn->buffer_id = p_hn->buffer_id;
                        hn->addr = p_hn->addr;
                        p_hn->allocated = false; // avoid freeing the parent
                        ggml_gallocr_free_extra_space(galloc, node, parent);
                        return;
                    }
                }
            }
        }

        // allocate tensor from the buffer
        struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
        ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
        size_t size = ggml_backend_buft_get_alloc_size(buft, node);
        hn->buffer_id = buffer_id;
        hn->addr = ggml_dyn_tallocr_alloc(alloc, size, node);
    }
}

static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
    // graph outputs are never freed
    if (node->flags & GGML_TENSOR_FLAG_OUTPUT) {
        AT_PRINTF("not freeing output %s\n", node->name);
        return;
    }

    struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
    int buffer_id = hn->buffer_id;
    struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
    ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
    size_t size = ggml_backend_buft_get_alloc_size(buft, node);
    ggml_dyn_tallocr_free_tensor(alloc, hn->addr, size, node);
    hn->allocated = false;
}

static int get_node_buffer_id(const int * node_buffer_ids, int i) {
    return node_buffer_ids ? node_buffer_ids[i] : 0;
}

static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
    // clear hash tables
    ggml_hash_set_reset(&galloc->hash_set);
    memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size);

    // allocate leafs
    // these may be tensors that the application is not using in the graph, but may still want to allocate for other purposes
    for (int i = 0; i < graph->n_leafs; i++) {
        struct ggml_tensor * leaf = graph->leafs[i];
        ggml_gallocr_allocate_node(galloc, leaf, get_node_buffer_id(leaf_buffer_ids, i));
    }

    // count number of children and views
    // allocate other graph inputs and leafs first to avoid overwriting them
    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];

        // TODO: better way to add external dependencies
        // GGML_OP_NONE does not appear normally in the graph nodes, but is used by ggml-backend to add dependencies to
        // control when some tensors are allocated and freed. in this case, the dependencies are in `src`, but the node
        // itself is never used and should not be considered a dependency
        if (ggml_is_view(node) && node->op != GGML_OP_NONE) {
            struct ggml_tensor * view_src = node->view_src;
            ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
        }

        if (node->flags & GGML_TENSOR_FLAG_INPUT) {
            ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i));
        }

        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * src = node->src[j];
            if (src == NULL) {
                continue;
            }

            ggml_gallocr_hash_get(galloc, src)->n_children += 1;

            // allocate explicit inputs
            if (src->flags & GGML_TENSOR_FLAG_INPUT) {
                ggml_gallocr_allocate_node(galloc, src, get_node_buffer_id(node_buffer_ids, i));
            }
        }
    }

    // allocate tensors
    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];
        int buffer_id = get_node_buffer_id(node_buffer_ids, i);

        // allocate parents (only leafs need to be allocated at this point)
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * parent = node->src[j];
            if (parent == NULL) {
                continue;
            }
            ggml_gallocr_allocate_node(galloc, parent, buffer_id);
        }

        // allocate node
        ggml_gallocr_allocate_node(galloc, node, buffer_id);

        AT_PRINTF("exec: %s (%s) <= ", ggml_op_desc(node), node->name);
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * parent = node->src[j];
            if (parent == NULL) {
                continue;
            }
            AT_PRINTF("%s", parent->name);
            if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
                AT_PRINTF(", ");
            }
        }
        AT_PRINTF("\n");

        // update parents
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * parent = node->src[j];
            if (parent == NULL) {
                continue;
            }
            struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
            p_hn->n_children -= 1;

            AT_PRINTF("parent %s: %d children, %d views, allocated: %d\n",
                parent->name, p_hn->n_children, p_hn->n_views, p_hn->allocated);

            if (p_hn->n_children == 0 && p_hn->n_views == 0) {
                if (ggml_is_view(parent)) {
                    struct ggml_tensor * view_src = parent->view_src;
                    struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src);
                    view_src_hn->n_views -= 1;
                    AT_PRINTF("view_src %s: %d children, %d views\n",
                        view_src->name, view_src_hn->n_children, view_src_hn->n_views);
                    if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src_hn->allocated) {
                        ggml_gallocr_free_node(galloc, view_src);
                    }
                } else if (p_hn->allocated) {
                    ggml_gallocr_free_node(galloc, parent);
                }
            }
            AT_PRINTF("\n");
        }
    }
}

bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
    size_t min_hash_size = graph->n_nodes + graph->n_leafs;
    // add 25% margin to avoid hash collisions
    min_hash_size += min_hash_size / 4;

    // initialize hash table
    if (galloc->hash_set.size < min_hash_size) {
        ggml_hash_set_free(&galloc->hash_set);
        galloc->hash_set = ggml_hash_set_new(min_hash_size);
        GGML_ASSERT(galloc->hash_set.keys != NULL);

        free(galloc->hash_values);
        galloc->hash_values = malloc(sizeof(struct hash_node) * galloc->hash_set.size);
        GGML_ASSERT(galloc->hash_values != NULL);
    }

    // reset allocators
    for (int i = 0; i < galloc->n_buffers; i++) {
        ggml_dyn_tallocr_reset(galloc->buf_tallocs[i]);
    }

    // allocate in hash table
    ggml_gallocr_alloc_graph_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids);

    // set the node_allocs from the hash table
    if (galloc->n_nodes < graph->n_nodes) {
        free(galloc->node_allocs);
        galloc->node_allocs = calloc(graph->n_nodes, sizeof(struct node_alloc));
        GGML_ASSERT(galloc->node_allocs != NULL);
    }
    galloc->n_nodes = graph->n_nodes;
    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];
        struct node_alloc * node_alloc = &galloc->node_allocs[i];
        if (node->view_src || node->data) {
            node_alloc->dst.buffer_id = -1;
            node_alloc->dst.addr = GGML_BUFFER_ADDRESS_INVALID;
            node_alloc->dst.size_max = 0;
        } else {
            struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
            node_alloc->dst.buffer_id = hn->buffer_id;
            node_alloc->dst.addr = hn->addr;
            node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
        }
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * src = node->src[j];
            if (!src || src->view_src || src->data) {
                node_alloc->src[j].buffer_id = -1;
                node_alloc->src[j].addr = GGML_BUFFER_ADDRESS_INVALID;
                node_alloc->src[j].size_max = 0;
            } else {
                struct hash_node * hn = ggml_gallocr_hash_get(galloc, src);
                node_alloc->src[j].buffer_id = hn->buffer_id;
                node_alloc->src[j].addr = hn->addr;
                node_alloc->src[j].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src);
            }
        }
    }
    if (galloc->n_leafs < graph->n_leafs) {
        free(galloc->leaf_allocs);
        galloc->leaf_allocs = calloc(graph->n_leafs, sizeof(galloc->leaf_allocs[0]));
        GGML_ASSERT(galloc->leaf_allocs != NULL);
    }
    galloc->n_leafs = graph->n_leafs;
    for (int i = 0; i < graph->n_leafs; i++) {
        struct ggml_tensor * leaf = graph->leafs[i];
        struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
        if (leaf->view_src || leaf->data) {
            galloc->leaf_allocs[i].leaf.buffer_id = -1;
            galloc->leaf_allocs[i].leaf.addr = GGML_BUFFER_ADDRESS_INVALID;
            galloc->leaf_allocs[i].leaf.size_max = 0;
        } else {
            galloc->leaf_allocs[i].leaf.buffer_id = hn->buffer_id;
            galloc->leaf_allocs[i].leaf.addr = hn->addr;
            galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
        }
    }

    // reallocate buffers if needed
    for (int i = 0; i < galloc->n_buffers; i++) {
        // if the buffer type is used multiple times, we reuse the same buffer
        for (int j = 0; j < i; j++) {
            if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) {
                galloc->buffers[i] = galloc->buffers[j];
                break;
            }
        }

        // even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
        bool realloc = galloc->buffers[i] == NULL;
        size_t new_size = 0;
        for (int c = 0; c < galloc->buf_tallocs[i]->n_chunks; c++) {
            size_t cur_chunk_size = galloc->buffers[i] ? ggml_vbuffer_chunk_size(galloc->buffers[i], c) : 0;
            size_t new_chunk_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i], c);
            new_size += new_chunk_size;
            if (new_chunk_size > cur_chunk_size) {
                realloc = true;
            }
        }
        if (realloc) {
#ifndef NDEBUG
            {
                size_t cur_size = galloc->buffers[i] ? ggml_vbuffer_size(galloc->buffers[i]) : 0;
                if (cur_size > 0) {
                    GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n",
                        __func__, ggml_backend_buft_name(galloc->bufts[i]),
                        cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
                }
            }
#endif

            ggml_vbuffer_free(galloc->buffers[i]);
            galloc->buffers[i] = ggml_vbuffer_alloc(galloc->bufts[i], galloc->buf_tallocs[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
            if (galloc->buffers[i] == NULL) {
                GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
                return false;
            }
        }
    }

    return true;
}

bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
    return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
}

static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, struct tensor_alloc * tensor_alloc) {
    int buffer_id = tensor_alloc->buffer_id;
    assert(tensor->data || tensor->view_src || ggml_backend_buft_get_alloc_size(galloc->bufts[buffer_id], tensor) <= tensor_alloc->size_max);

    if (tensor->view_src != NULL) {
        if (tensor->buffer == NULL) {
            assert(tensor_alloc->addr.offset == SIZE_MAX);
            if (tensor->view_src->buffer == NULL) {
                // this tensor was allocated without ggml-backend
                return;
            }
            ggml_backend_view_init(tensor);
        }
    } else {
        if (tensor->data == NULL) {
            assert(tensor_alloc->addr.offset != SIZE_MAX);
            assert(ggml_backend_buft_get_alloc_size(galloc->bufts[buffer_id], tensor) <= tensor_alloc->size_max);
            ggml_vbuffer_tensor_alloc(galloc->buffers[buffer_id], tensor, tensor_alloc->addr);
        } else {
            if (tensor->buffer == NULL) {
                // this tensor was allocated without ggml-backend
                return;
            }
        }
    }
}

static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
    size_t node_size = 0;
    if (!node->data && !node->view_src) {
        // If we previously had data but don't now then reallocate
        if (talloc->buffer_id < 0) {
            return false;
        }
        node_size = ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
    }
    return talloc->size_max >= node_size;
}

static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
    if (galloc->n_nodes != graph->n_nodes) {
#ifndef NDEBUG
        GGML_LOG_DEBUG("%s: graph has different number of nodes\n", __func__);
#endif
        return true;
    }

    if (galloc->n_leafs != graph->n_leafs) {
#ifndef NDEBUG
        GGML_LOG_DEBUG("%s: graph has different number of leafs\n", __func__);
#endif
        return true;
    }

    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];
        struct node_alloc * node_alloc = &galloc->node_allocs[i];

        if (!ggml_gallocr_node_needs_realloc(galloc, node, &node_alloc->dst)) {
#ifndef NDEBUG
            GGML_LOG_DEBUG("%s: node %s is not valid\n", __func__, node->name);
#endif
            return true;
        }

        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * src = node->src[j];
            if (src == NULL) {
                continue;
            }
            if (!ggml_gallocr_node_needs_realloc(galloc, src, &node_alloc->src[j])) {
#ifndef NDEBUG
                GGML_LOG_DEBUG("%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
#endif
                return true;
            }
        }
    }

    return false;
}

bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
    if (ggml_gallocr_needs_realloc(galloc, graph)) {
        if (galloc->n_buffers == 1) {
#ifndef NDEBUG
            GGML_LOG_DEBUG("%s: reallocating buffers automatically\n", __func__);
#endif
            if (!ggml_gallocr_reserve(galloc, graph)) {
                return false;
            }
        } else {
#ifndef NDEBUG
            GGML_LOG_DEBUG("%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
#endif
            return false;
        }
    }

    // reset buffers
    for (int i = 0; i < galloc->n_buffers; i++) {
        if (galloc->buffers[i] != NULL) {
            ggml_vbuffer_reset(galloc->buffers[i]);
        }
    }

    // allocate the graph tensors from the previous assignments
    // leafs
    for (int i = 0; i < graph->n_leafs; i++) {
        struct ggml_tensor * leaf = graph->leafs[i];
        struct leaf_alloc * leaf_alloc = &galloc->leaf_allocs[i];
        ggml_gallocr_init_tensor(galloc, leaf, &leaf_alloc->leaf);
    }
    // nodes
    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];
        struct node_alloc * node_alloc = &galloc->node_allocs[i];
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * src = node->src[j];
            if (src == NULL) {
                continue;
            }
            ggml_gallocr_init_tensor(galloc, src, &node_alloc->src[j]);
        }
        ggml_gallocr_init_tensor(galloc, node, &node_alloc->dst);
    }

    return true;
}
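
// Note (not part of the original source): ggml_gallocr_alloc_graph() assigns fresh addresses on every call,
// so input data should be written into the graph's input tensors (e.g. with ggml_backend_tensor_set())
// only after this function has returned successfully.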

size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
    GGML_ASSERT(buffer_id >= 0 && buffer_id < galloc->n_buffers);

    if (galloc->buffers[buffer_id] == NULL) {
        return 0;
    }

    for (int i = 0; i < buffer_id; i++) {
        if (galloc->buffers[i] == galloc->buffers[buffer_id]) {
            // this buffer is the same as a previous one due to the same buffer type being used multiple times
            // only return the buffer size the first time it appears to avoid double counting
            return 0;
        }
    }

    return ggml_vbuffer_size(galloc->buffers[buffer_id]);
}

// utils

static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) {
    for (size_t i = 0; i < *n_buffers; i++) {
        ggml_backend_buffer_free((*buffers)[i]);
    }
    free(*buffers);
}

static bool alloc_tensor_range(struct ggml_context * ctx,
        struct ggml_tensor * first, struct ggml_tensor * last,
        ggml_backend_buffer_type_t buft, size_t size,
        ggml_backend_buffer_t ** buffers, size_t * n_buffers) {

    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
    if (buffer == NULL) {
        GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
        free_buffers(buffers, n_buffers);
        return false;
    }

    *buffers = realloc(*buffers, sizeof(ggml_backend_buffer_t) * (*n_buffers + 1));
    (*buffers)[(*n_buffers)++] = buffer;

    struct ggml_tallocr tallocr = ggml_tallocr_new(buffer);

    for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
        enum ggml_status status = GGML_STATUS_SUCCESS;
        if (t->data == NULL) {
            if (t->view_src == NULL) {
                status = ggml_tallocr_alloc(&tallocr, t);
            } else if (t->buffer == NULL) {
                status = ggml_backend_view_init(t);
            }
        } else {
            if (t->view_src != NULL && t->buffer == NULL) {
                // view of a pre-allocated tensor
                status = ggml_backend_view_init(t);
            }
        }
        if (status != GGML_STATUS_SUCCESS) {
            GGML_LOG_ERROR("%s: failed to initialize tensor %s\n", __func__, t->name);
            free_buffers(buffers, n_buffers);
            return false;
        }
    }

    return true;
}

ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
    GGML_ASSERT(ggml_get_no_alloc(ctx) == true);

    size_t alignment = ggml_backend_buft_get_alignment(buft);
    size_t max_size = ggml_backend_buft_get_max_size(buft);

    ggml_backend_buffer_t * buffers = NULL;
    size_t n_buffers = 0;

    size_t cur_buf_size = 0;
    struct ggml_tensor * first = ggml_get_first_tensor(ctx);
    for (struct ggml_tensor * t = first; t != NULL; t = ggml_get_next_tensor(ctx, t)) {
        size_t this_size = 0;
        if (t->data == NULL && t->view_src == NULL) {
            this_size = GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
        }

        if (cur_buf_size > 0 && (cur_buf_size + this_size) > max_size) {
            // allocate tensors in the current buffer
            if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
                return NULL;
            }
            first = t;
            cur_buf_size = this_size;
        } else {
            cur_buf_size += this_size;
        }
    }

    // allocate remaining tensors
    if (cur_buf_size > 0) {
        if (!alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
            return NULL;
        }
    }

    if (n_buffers == 0) {
#ifndef NDEBUG
        GGML_LOG_DEBUG("%s: all tensors in the context are already allocated\n", __func__);
#endif
        return NULL;
    }

    ggml_backend_buffer_t buffer;
    if (n_buffers == 1) {
        buffer = buffers[0];
    } else {
        buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers);
    }
    free(buffers);
    return buffer;
}

ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend) {
    return ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_get_default_buffer_type(backend));
}
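
// Minimal usage sketch for allocating all tensors of a context in one backend buffer (illustrative; the
// context size, tensor shapes and `backend` are assumptions of this example, not part of the file above):
//
//     struct ggml_init_params params = {
//         /*.mem_size   =*/ ggml_tensor_overhead() * 8,
//         /*.mem_buffer =*/ NULL,
//         /*.no_alloc   =*/ true, // required: tensor data is allocated by ggml-backend below
//     };
//     struct ggml_context * ctx = ggml_init(params);
//     struct ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1024, 1024);
//     ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);
//     // ... use w, then free buf and ctx when done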