#include <ggml-alloc.h>
#include <ggml-backend-impl.h>
#include <ggml-cpp.h>
#include <ggml-impl.h>
#include <ggml.h>

#include <algorithm>
#include <cstdio>
#include <cstring>
#include <exception>
#include <memory>
#include <vector>
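
// Tests for the ggml graph allocator (ggml_gallocr). A dummy backend with a
// configurable max buffer size records every buffer it hands out, so the tests can
// verify chunking, alignment, in-place reuse and the total amount of allocated
// memory without touching a real device.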

//
// dummy backend with configurable max_buffer_size, tracks allocations

// arbitrary non-null base address; the dummy backend never dereferences tensor data
uint8_t * const alloc_base = (uint8_t *) 16;

struct dummy_backend_context {
    size_t max_buffer_size = 64;
    size_t alignment = 8;
    ggml_backend_buffer_i buffer_interface;
    std::vector<ggml_backend_buffer_t> buffers;

    size_t allocated_total() const {
        size_t n = 0;
        for (ggml_backend_buffer_t buf : buffers) {
            n += ggml_backend_buffer_get_size(buf);
        }
        return n;
    }
};

// ggml_backend_buffer_type interface

static const char * dummy_backend_buffer_type_get_name(ggml_backend_buffer_type_t) {
    return "dummy_buffer_type";
}

static ggml_backend_buffer_t dummy_backend_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
    dummy_backend_context * ctx = (dummy_backend_context *) buft->context;
    ggml_backend_buffer_t & buffer = ctx->buffers.emplace_back();
    buffer = ggml_backend_buffer_init(buft, ctx->buffer_interface, ctx, size);
    return buffer;
}

static size_t dummy_backend_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
    dummy_backend_context * ctx = (dummy_backend_context *) buft->context;
    return ctx->alignment;
}

static size_t dummy_backend_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
    dummy_backend_context * ctx = (dummy_backend_context *) buft->context;
    return ctx->max_buffer_size;
}

static bool dummy_backend_buffer_type_is_host(ggml_backend_buffer_type_t) {
    return true;
}

// ggml_backend_buffer interface

static void dummy_backend_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    dummy_backend_context * ctx = (dummy_backend_context *) buffer->context;
    auto i = std::find(ctx->buffers.begin(), ctx->buffers.end(), buffer);
    GGML_ASSERT(i != ctx->buffers.end());
    ctx->buffers.erase(i);
}

static void * dummy_backend_buffer_get_base(ggml_backend_buffer_t) {
    return alloc_base;
}

static ggml_status dummy_backend_buffer_init_tensor(ggml_backend_buffer_t, ggml_tensor *) {
    return GGML_STATUS_SUCCESS;
}

static void dummy_backend_buffer_memset_tensor(ggml_backend_buffer_t, ggml_tensor *, uint8_t, size_t, size_t) {}
static void dummy_backend_buffer_set_tensor(ggml_backend_buffer_t, ggml_tensor *, const void *, size_t, size_t) {}
static void dummy_backend_buffer_get_tensor(ggml_backend_buffer_t, const ggml_tensor *, void *, size_t, size_t) {}
static void dummy_backend_buffer_clear(ggml_backend_buffer_t, uint8_t) {}

// dummy_backend (not really a full backend, just provides what gallocr needs)

struct dummy_backend {
    std::unique_ptr<dummy_backend_context> context;
    ggml_backend_buffer_type buffer_type;
};

static dummy_backend dummy_backend_init(size_t max_buffer_size, size_t alignment = 8) {
    dummy_backend b{};
    b.context = std::make_unique<dummy_backend_context>();
    b.context->alignment = alignment;
    b.context->max_buffer_size = max_buffer_size;

    b.context->buffer_interface.free_buffer = dummy_backend_buffer_free_buffer;
    b.context->buffer_interface.get_base = dummy_backend_buffer_get_base;
    b.context->buffer_interface.init_tensor = dummy_backend_buffer_init_tensor;
    b.context->buffer_interface.memset_tensor = dummy_backend_buffer_memset_tensor;
    b.context->buffer_interface.set_tensor = dummy_backend_buffer_set_tensor;
    b.context->buffer_interface.get_tensor = dummy_backend_buffer_get_tensor;
    b.context->buffer_interface.clear = dummy_backend_buffer_clear;

    b.buffer_type.context = b.context.get();
    b.buffer_type.iface.get_name = dummy_backend_buffer_type_get_name;
    b.buffer_type.iface.alloc_buffer = dummy_backend_buffer_type_alloc_buffer;
    b.buffer_type.iface.get_alignment = dummy_backend_buffer_type_get_alignment;
    b.buffer_type.iface.get_max_size = dummy_backend_buffer_type_get_max_size;
    b.buffer_type.iface.is_host = dummy_backend_buffer_type_is_host;
    return b;
}

//
// test utilities

struct test_context_with_graph {
    ggml_context * ctx;
    ggml_cgraph * graph;
    ggml_context_ptr ctx_ptr;
};
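
// Creates a no_alloc context with enough space for the tensor and graph metadata used
// by these tests; the tensor data itself is allocated later through gallocr.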
static test_context_with_graph make_context() {
    ggml_init_params params{};
    params.mem_size = 48 * ggml_tensor_overhead() + ggml_graph_overhead();
    params.no_alloc = true;
    ggml_context * ctx = ggml_init(params);
    ggml_context_ptr ctx_ptr = ggml_context_ptr(ctx);
    ggml_cgraph * graph = ggml_new_graph(ctx);
    return { ctx, graph, std::move(ctx_ptr) };
}

static ggml_tensor * make_input_1d(ggml_context * ctx, int64_t n_elements) {
    ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
    ggml_set_input(t);
    return t;
}

static ggml_tensor * make_input_with_size(ggml_context * ctx, size_t size_bytes) {
    GGML_ASSERT(size_bytes % 4 == 0);
    return make_input_1d(ctx, size_bytes / 4);
}

static void assign_names(ggml_context * ctx, const char * prefix = "x") {
    int i = 0;
    for (ggml_tensor * t = ggml_get_first_tensor(ctx); t; t = ggml_get_next_tensor(ctx, t)) {
        ggml_format_name(t, "%s%d", prefix, i++);
    }
}

static int get_leaf_id(ggml_cgraph * graph, const char * tensor_name) {
    for (int i = 0; i < graph->n_leafs; ++i) {
        if (strncmp(graph->leafs[i]->name, tensor_name, GGML_MAX_NAME) == 0) {
            return i;
        }
    }
    fprintf(stderr, "leaf not found: %s\n", tensor_name);
    return -1;
}

static int get_node_id(ggml_cgraph * graph, const char * tensor_name) {
    for (int i = 0; i < graph->n_nodes; ++i) {
        if (strncmp(graph->nodes[i]->name, tensor_name, GGML_MAX_NAME) == 0) {
            return i;
        }
    }
    fprintf(stderr, "node not found: %s\n", tensor_name);
    return -1;
}

static ggml_gallocr_ptr allocate_graph(ggml_cgraph * graph, ggml_tensor * out, ggml_backend_buffer_type_t buft) {
    ggml_set_output(out);
    ggml_build_forward_expand(graph, out);
    ggml_gallocr_ptr galloc = ggml_gallocr_ptr(ggml_gallocr_new(buft));
    bool result = ggml_gallocr_alloc_graph(galloc.get(), graph);
    GGML_ASSERT(result);
    return galloc;
}

//
// correctness checks for result allocations

static void check_all_allocated(ggml_cgraph * graph) {
    for (int i = 0; i < ggml_graph_n_nodes(graph); ++i) {
        ggml_tensor * t = ggml_graph_node(graph, i);
        GGML_ASSERT(t->buffer != nullptr);
        GGML_ASSERT(t->data != nullptr);
    }
}

static void check_max_size(ggml_context * ctx) {
    for (ggml_tensor * t = ggml_get_first_tensor(ctx); t; t = ggml_get_next_tensor(ctx, t)) {
        auto buft = ggml_backend_buffer_get_type(t->buffer);
        size_t max_size = ggml_backend_buft_get_max_size(buft);
        size_t offset = (char *) t->data - (char *) ggml_backend_buffer_get_base(t->buffer);
        GGML_ASSERT(t->data >= ggml_backend_buffer_get_base(t->buffer));
        GGML_ASSERT((size_t) offset + ggml_nbytes(t) <= max_size);
    }
}

static bool can_reuse_memory(ggml_cgraph * graph, int current_i, ggml_tensor * current, ggml_tensor * other) {
    if (other->flags & GGML_TENSOR_FLAG_OUTPUT) {
        return false;
    }
    // Check if `other` is still "alive", i.e. an input to any node after the `current` op
    for (int i = current_i; i < ggml_graph_n_nodes(graph); ++i) {
        ggml_tensor * t = ggml_graph_node(graph, i);
        for (int s = 0; s < GGML_MAX_SRC; s++) {
            if (t == current && ggml_op_can_inplace(t->op)) {
                continue;
            }
            if (t->src[s] == other) {
                return false;
            }
            if (t->src[s] && t->src[s]->view_src == other) {
                return false;
            }
        }
    }
    return true;
}

static bool memory_overlap(ggml_tensor * a, ggml_tensor * b) {
    if (a->buffer != b->buffer) {
        return false;
    }
    int64_t a0 = (int64_t) a->data;
    int64_t a1 = a0 + ggml_nbytes(a);
    int64_t b0 = (int64_t) b->data;
    int64_t b1 = b0 + ggml_nbytes(b);
    return a1 > b0 && b1 > a0;
}

static ggml_tensor * get_view_source(ggml_tensor * t) {
    while (t->view_src) {
        t = t->view_src;
    }
    return t;
}
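
// Two graph nodes may only share memory if the earlier tensor is no longer needed
// at that point in the graph (and is not an output), as decided by can_reuse_memory().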
static void check_no_overlap(ggml_cgraph * graph) {
    for (int i = 0; i < ggml_graph_n_nodes(graph); ++i) {
        for (int j = 0; j < i; ++j) {
            ggml_tensor * t = ggml_graph_node(graph, i);
            ggml_tensor * o = ggml_graph_node(graph, j);
            GGML_ASSERT(t != o);
            if (get_view_source(t) == get_view_source(o)) {
                continue;
            }
            if (memory_overlap(t, o)) {
                GGML_ASSERT(can_reuse_memory(graph, i, t, o));
            }
        }
    }
}

//
// test cases

// Scenario where the first backend buffer is completely exhausted and there are further
// tensors which require a second buffer
static void test_max_size_too_many_tensors() {
    dummy_backend backend = dummy_backend_init(16);
    auto [ctx, graph, ctx_ptr] = make_context();

    ggml_tensor * x[7];
    x[0] = make_input_with_size(ctx, 8);
    x[1] = make_input_with_size(ctx, 8);
    x[2] = make_input_with_size(ctx, 8);
    x[3] = ggml_mul(ctx, x[0], x[1]);
    x[4] = ggml_add(ctx, x[1], x[2]);
    x[5] = ggml_add(ctx, x[3], x[0]);
    x[6] = ggml_add(ctx, x[4], x[5]);
    assign_names(ctx);

    ggml_gallocr_ptr galloc = allocate_graph(graph, x[6], &backend.buffer_type);
    check_all_allocated(graph);
    check_no_overlap(graph);
    check_max_size(ctx);
    GGML_ASSERT(backend.context->allocated_total() <= 16 + 16);
}

// Scenario where there is some space left in the first buffer, but not enough to accommodate
// a larger tensor, so a second buffer is required
static void test_max_size_tensor_too_large() {
    dummy_backend backend = dummy_backend_init(32);
    auto [ctx, graph, ctx_ptr] = make_context();

    ggml_tensor * x[3];
    x[0] = make_input_with_size(ctx, 16);   // chunk 0, [0 , 16)
    x[1] = make_input_with_size(ctx, 8);    // chunk 0, [16, 24)
    x[2] = ggml_concat(ctx, x[0], x[1], 0); // chunk 1, [0 , 24)
    assign_names(ctx);

    ggml_gallocr_ptr galloc = allocate_graph(graph, x[2], &backend.buffer_type);
    check_all_allocated(graph);
    check_no_overlap(graph);
    check_max_size(ctx);
    GGML_ASSERT(backend.context->allocated_total() <= 32 + 24);
}

// Scenario where a single tensor exceeds the max buffer size - in this case the allocator
// should try to create a bigger buffer anyway, and wait for the backend to throw an error.
// Backends may report an artificially lower max size in some cases for compatibility reasons.
static void test_tensor_larger_than_max_size() {
    dummy_backend backend = dummy_backend_init(16);
    auto [ctx, graph, ctx_ptr] = make_context();

    ggml_tensor * x[2];
    x[0] = make_input_with_size(ctx, 24);
    x[1] = ggml_scale(ctx, x[0], 2.0f);
    assign_names(ctx);

    ggml_gallocr_ptr galloc = allocate_graph(graph, x[1], &backend.buffer_type);
    check_all_allocated(graph);
    check_no_overlap(graph);
    GGML_ASSERT(backend.context->allocated_total() == 24);
}

// This test assumes a max of 16 buffer chunks, and tries to allocate tensors that would
// require more. The expectation is that the last buffer grows to fit everything,
// leaving it to the backend to error out if it can't allocate that much.
static void test_not_enough_chunks() {
    const int max_chunks = 16;
    const int max_size = 8;
    dummy_backend backend = dummy_backend_init(max_size);
    auto [ctx, graph, ctx_ptr] = make_context();

    ggml_tensor * x[max_chunks + 1];
    for (int i = 0; i < max_chunks + 1; ++i) {
        x[i] = make_input_with_size(ctx, max_size);
    }
    ggml_tensor * acc = x[0];
    for (int i = 0; i < max_chunks; ++i) {
        acc = ggml_add(ctx, acc, x[i + 1]);
    }
    assign_names(ctx);

    ggml_gallocr_ptr galloc = allocate_graph(graph, acc, &backend.buffer_type);
    check_all_allocated(graph);
    check_no_overlap(graph);
    GGML_ASSERT(backend.context->allocated_total() > max_chunks * max_size);
}

// Fill up leftover unallocated space of a chunk after allocating a large tensor that
// requires a new chunk.
static void test_fill_leftover_space() {
    dummy_backend backend = dummy_backend_init(16);
    auto [ctx, graph, ctx_ptr] = make_context();

    ggml_tensor * x[3];
    x[0] = make_input_with_size(ctx, 8);
    x[1] = ggml_pad(ctx, x[0], 2, 0, 0, 0);
    x[2] = ggml_mean(ctx, x[1]);
    assign_names(ctx);

    ggml_gallocr_ptr galloc = allocate_graph(graph, x[2], &backend.buffer_type);
    check_all_allocated(graph);
    check_no_overlap(graph);
    check_max_size(ctx);
    GGML_ASSERT(backend.context->allocated_total() <= 12 + 16);
}

// Check that views don't require any extra memory
static void test_view_inplace() {
    dummy_backend backend = dummy_backend_init(32);
    auto [ctx, graph, ctx_ptr] = make_context();

    ggml_tensor * x[6];
    x[0] = make_input_1d(ctx, 4);               // chunk 0, [0, 16)
    x[1] = ggml_reshape_2d(ctx, x[0], 2, 2);    // view of x0
    x[2] = ggml_permute(ctx, x[1], 1, 0, 2, 3); // view of x0
    x[3] = ggml_view_1d(ctx, x[2], 2, 4);       // view of x0
    x[4] = make_input_1d(ctx, 2);               // chunk 0, [16, 24)
    x[5] = ggml_add(ctx, x[3], x[4]);           // reuse (inplace add)
    assign_names(ctx);

    ggml_gallocr_ptr galloc = allocate_graph(graph, x[5], &backend.buffer_type);
    check_all_allocated(graph);
    check_no_overlap(graph);
    check_max_size(ctx);
    GGML_ASSERT(backend.context->allocated_total() <= 24);
}
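
// Mix of allocations, in-place reuse and frees; freed blocks should be handed out
// again instead of growing the buffers further than necessary.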
static void test_reuse_and_free() {
    dummy_backend backend = dummy_backend_init(40);
    auto [ctx, graph, ctx_ptr] = make_context();

    ggml_tensor * x[9];
    x[0] = make_input_with_size(ctx, 24);
    x[1] = make_input_with_size(ctx, 8);
    x[2] = make_input_with_size(ctx, 8);
    x[3] = ggml_add(ctx, x[1], x[2]);       // reuse, free x2
    x[4] = ggml_pad(ctx, x[0], 2, 0, 0, 0); // alloc new buffer, free x0
    x[5] = ggml_scale(ctx, x[4], 2.0f);     // alloc from free block
    x[6] = ggml_add(ctx, x[4], x[5]);       // reuse, free x5
    x[7] = ggml_view_1d(ctx, x[6], 2, 8);   // view
    x[8] = ggml_add(ctx, x[3], x[7]);       // reuse
    assign_names(ctx);

    ggml_gallocr_ptr galloc = allocate_graph(graph, x[8], &backend.buffer_type);
    check_all_allocated(graph);
    check_no_overlap(graph);
    check_max_size(ctx);
    GGML_ASSERT(backend.context->allocated_total() <= 40 + 32 + 32);
}
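
// Adjacent freed blocks should be merged so that a later, larger tensor fits into the
// combined space instead of requesting additional memory.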
static void test_merge_free_block(size_t max_buffer_size) {
    dummy_backend backend = dummy_backend_init(max_buffer_size);
    auto [ctx, graph, ctx_ptr] = make_context();

    ggml_tensor * x[9];
    x[0] = make_input_with_size(ctx, 16);
    x[1] = make_input_with_size(ctx, 16);
    x[2] = make_input_with_size(ctx, 16);
    x[3] = ggml_mean(ctx, x[0]);
    x[4] = ggml_mean(ctx, x[1]);
    x[5] = ggml_pad(ctx, x[2], 2, 0, 0, 0);
    x[6] = ggml_add(ctx, x[3], x[4]);
    x[7] = ggml_pad(ctx, x[6], 5, 0, 0, 0);
    x[8] = ggml_add(ctx, x[5], x[7]);
    assign_names(ctx);

    ggml_gallocr_ptr galloc = allocate_graph(graph, x[8], &backend.buffer_type);
    check_all_allocated(graph);
    check_no_overlap(graph);
    check_max_size(ctx);
    GGML_ASSERT(backend.context->allocated_total() <= 32 + 32 + 24);
}

// Check that previously allocated but freed memory is preferred over allocating
// additional memory, even if the remaining space in a chunk would match the tensor
// size better
static void test_prefer_already_allocated_memory() {
    dummy_backend backend = dummy_backend_init(32, /*align*/ 4);
    auto [ctx, graph, ctx_ptr] = make_context();

    ggml_tensor * x[3];
    x[0] = make_input_with_size(ctx, 24); // [24b][8b unused]
    x[1] = ggml_mean(ctx, x[0]);          // [24b free][4b][4b unused]
    x[2] = ggml_mean(ctx, x[1]);          // should be allocated in the 24b block
    assign_names(ctx);

    ggml_gallocr_ptr galloc = allocate_graph(graph, x[2], &backend.buffer_type);
    check_all_allocated(graph);
    check_no_overlap(graph);
    GGML_ASSERT(backend.context->allocated_total() <= 28);
}

// Test for allocating on multiple devices with some tensors in the graph
// allocated externally (not by gallocr).
static void test_multiple_buffer_types() {
    dummy_backend backend_a = dummy_backend_init(32);
    dummy_backend backend_b = dummy_backend_init(SIZE_MAX);
    auto [ctx_a, _a, ctx_a_ptr] = make_context();
    auto [ctx_b, _b, ctx_b_ptr] = make_context();
    auto [ctx, graph, ctx_ptr] = make_context();

    ggml_tensor * a[2];
    a[0] = make_input_with_size(ctx_a, 16);
    a[1] = make_input_with_size(ctx_a, 16);
    assign_names(ctx_a, "a");

    ggml_tensor * b[2];
    b[0] = make_input_with_size(ctx_b, 24);
    b[1] = make_input_with_size(ctx_b, 4);
    assign_names(ctx_b, "b");

    ggml_tensor * x[9];
    x[0] = make_input_with_size(ctx, 16);
    x[1] = ggml_mul(ctx, x[0], a[0]);
    x[2] = ggml_pad(ctx, x[1], 2, 0, 0, 0);
    x[3] = ggml_mul(ctx, x[2], b[0]);
    x[4] = ggml_mean(ctx, x[3]);
    x[5] = ggml_add(ctx, x[4], b[1]);
    x[6] = ggml_pad(ctx, x[5], 3, 0, 0, 0);
    x[7] = ggml_add(ctx, x[6], a[1]);
    x[8] = ggml_scale(ctx, x[7], 2.0f);
    assign_names(ctx, "x");

    ggml_backend_buffer_ptr buf_a(ggml_backend_alloc_ctx_tensors_from_buft(ctx_a, &backend_a.buffer_type));
    ggml_backend_buffer_ptr buf_b(ggml_backend_alloc_ctx_tensors_from_buft(ctx_b, &backend_b.buffer_type));
    ggml_backend_buffer_type_t bufts[2] = { &backend_a.buffer_type, &backend_b.buffer_type };

    // assign buffer types manually to avoid extra complexity from the backend scheduler
    ggml_set_output(x[8]);
    ggml_build_forward_expand(graph, x[8]);

    GGML_ASSERT(graph->n_leafs == 5);
    int leaf_buffer_ids[5];
    leaf_buffer_ids[get_leaf_id(graph, "a0")] = 0;
    leaf_buffer_ids[get_leaf_id(graph, "a1")] = 0;
    leaf_buffer_ids[get_leaf_id(graph, "b0")] = 1;
    leaf_buffer_ids[get_leaf_id(graph, "b1")] = 1;
    leaf_buffer_ids[get_leaf_id(graph, "x0")] = 0;

    GGML_ASSERT(graph->n_nodes == 8);
    int node_buffer_ids[8];
    node_buffer_ids[get_node_id(graph, "x1")] = 0;
    node_buffer_ids[get_node_id(graph, "x2")] = 0;
    node_buffer_ids[get_node_id(graph, "x3")] = 1;
    node_buffer_ids[get_node_id(graph, "x4")] = 1;
    node_buffer_ids[get_node_id(graph, "x5")] = 1;
    node_buffer_ids[get_node_id(graph, "x6")] = 1;
    node_buffer_ids[get_node_id(graph, "x7")] = 0;
    node_buffer_ids[get_node_id(graph, "x8")] = 0;

    ggml_gallocr_ptr galloc(ggml_gallocr_new_n(bufts, 2));
    ggml_gallocr_reserve_n(galloc.get(), graph, node_buffer_ids, leaf_buffer_ids);
    ggml_gallocr_alloc_graph(galloc.get(), graph);

    check_all_allocated(graph);
    check_no_overlap(graph);
    check_max_size(ctx);
    GGML_ASSERT(backend_a.context->allocated_total() <= 32 + 32 + 24);
    GGML_ASSERT(backend_b.context->allocated_total() <= 32 + 24);
}
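
// With two buffer types but a graph that only needs the first one, no buffer should be
// allocated for the second type.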
static void test_buffer_size_zero() {
    dummy_backend backend_a = dummy_backend_init(SIZE_MAX);
    dummy_backend backend_b = dummy_backend_init(SIZE_MAX);
    auto [ctx, graph, ctx_ptr] = make_context();

    ggml_tensor * x[2];
    x[0] = make_input_with_size(ctx, 16);
    x[1] = ggml_scale(ctx, x[0], 2.0f);
    ggml_set_output(x[1]);
    ggml_build_forward_expand(graph, x[1]);

    int leaf_buffer_ids[1] = { 0 };
    int node_buffer_ids[1] = { 0 };
    ggml_backend_buffer_type_t bufts[2] = { &backend_a.buffer_type, &backend_b.buffer_type };
    ggml_gallocr_ptr galloc = ggml_gallocr_ptr(ggml_gallocr_new_n(bufts, 2));
    bool res1 = ggml_gallocr_reserve_n(galloc.get(), graph, node_buffer_ids, leaf_buffer_ids);
    bool res2 = ggml_gallocr_alloc_graph(galloc.get(), graph);
    GGML_ASSERT(res1 && res2);

    check_all_allocated(graph);
    GGML_ASSERT(backend_a.context->allocated_total() == 16);
    GGML_ASSERT(backend_b.context->allocated_total() == 0);
}

// Test re-using gallocr for a different graph. The new graph has the same
// total size, but one of the chunks is larger, so reallocation is required.
static void test_reallocation() {
    dummy_backend backend = dummy_backend_init(32, /*align*/ 4);
    ggml_gallocr_ptr galloc;
    {
        auto [ctx, graph, ctx_ptr] = make_context();
        ggml_tensor * x[4];
        x[0] = make_input_with_size(ctx, 24);
        x[1] = make_input_with_size(ctx, 16);
        x[2] = ggml_view_1d(ctx, x[0], 4, 0);
        x[3] = ggml_add(ctx, x[2], x[1]);
        assign_names(ctx);

        galloc = allocate_graph(graph, x[3], &backend.buffer_type);
        check_all_allocated(graph);
        GGML_ASSERT(backend.context->allocated_total() == 40);
    }
    {
        auto [ctx, graph, ctx_ptr] = make_context();
        ggml_tensor * x[3];
        x[0] = make_input_with_size(ctx, 20);
        x[1] = make_input_with_size(ctx, 20);
        x[2] = ggml_add(ctx, x[0], x[1]);
        assign_names(ctx);

        ggml_set_output(x[2]);
        ggml_build_forward_expand(graph, x[2]);
        bool result = ggml_gallocr_alloc_graph(galloc.get(), graph);
        GGML_ASSERT(result);
        check_all_allocated(graph);
        GGML_ASSERT(backend.context->allocated_total() == 40);
    }
}

static void run(const char * name, void (*f)()) {
    printf("%s ", name);
    fflush(stdout);
    f();
    printf("PASSED\n");
}

int main() {
    run("test_max_size_too_many_tensors", test_max_size_too_many_tensors);
    run("test_max_size_tensor_too_large", test_max_size_tensor_too_large);
    run("test_tensor_larger_than_max_size", test_tensor_larger_than_max_size);
    run("test_not_enough_chunks", test_not_enough_chunks);
    run("test_fill_leftover_space", test_fill_leftover_space);
    run("test_view_inplace", test_view_inplace);
    run("test_reuse_and_free", test_reuse_and_free);
    run("test_merge_free_block(32)", []() { test_merge_free_block(32); });
    run("test_merge_free_block(SIZE_MAX)", []() { test_merge_free_block(SIZE_MAX); });
    run("test_prefer_already_allocated_memory", test_prefer_already_allocated_memory);
    run("test_multiple_buffer_types", test_multiple_buffer_types);
    run("test_buffer_size_zero", test_buffer_size_zero);
    run("test_reallocation", test_reallocation);
    return 0;
}