// ggml-backend.c
#include "ggml-backend-impl.h"
#include "ggml-alloc.h"
#include "ggml-impl.h"

#include <assert.h>
#include <limits.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX(a, b) ((a) > (b) ? (a) : (b))
// backend buffer type

ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
    return buft->iface.alloc_buffer(buft, size);
}

size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
    return buft->iface.get_alignment(buft);
}

size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
    // get_alloc_size is optional, defaults to ggml_nbytes
    if (buft->iface.get_alloc_size) {
        return buft->iface.get_alloc_size(buft, tensor);
    }
    return ggml_nbytes(tensor);
}

bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
    return buft->iface.supports_backend(buft, backend);
}

bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
    if (buft->iface.is_host) {
        return buft->iface.is_host(buft);
    }
    return false;
}
// backend buffer

ggml_backend_buffer_t ggml_backend_buffer_init(
        ggml_backend_buffer_type_t    buft,
        struct ggml_backend_buffer_i  iface,
        ggml_backend_buffer_context_t context,
        size_t                        size) {
    ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer));

    GGML_ASSERT(iface.get_base != NULL);

    (*buffer) = (struct ggml_backend_buffer) {
        /* .interface = */ iface,
        /* .buft      = */ buft,
        /* .context   = */ context,
        /* .size      = */ size,
    };

    return buffer;
}

void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
    if (buffer == NULL) {
        return;
    }

    if (buffer->iface.free_buffer != NULL) {
        buffer->iface.free_buffer(buffer);
    }
    free(buffer);
}

size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
    return buffer->size;
}

void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
    void * base = buffer->iface.get_base(buffer);

    GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");

    return base;
}

void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
    // init_tensor is optional
    if (buffer->iface.init_tensor) {
        buffer->iface.init_tensor(buffer, tensor);
    }
}

size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer) {
    return ggml_backend_buft_get_alignment(ggml_backend_buffer_type(buffer));
}

size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
    return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type(buffer), tensor);
}

void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
    buffer->iface.clear(buffer, value);
}

bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
    return ggml_backend_buft_is_host(ggml_backend_buffer_type(buffer));
}

ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer) {
    return buffer->buft;
}
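// illustrative usage sketch (not compiled): inspecting a backend buffer through the API above.
// `buffer` is assumed to come from ggml_backend_alloc_buffer() or a backend-specific allocator.
#if 0
static void example_inspect_buffer(ggml_backend_buffer_t buffer) {
    void * base  = ggml_backend_buffer_get_base(buffer);
    size_t size  = ggml_backend_buffer_get_size(buffer);
    size_t align = ggml_backend_buffer_get_alignment(buffer);

    fprintf(stderr, "buffer: base=%p size=%zu align=%zu host=%d\n",
        base, size, align, ggml_backend_buffer_is_host(buffer));

    ggml_backend_buffer_clear(buffer, 0); // fill the whole buffer with zeros
}
#endif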
// backend

const char * ggml_backend_name(ggml_backend_t backend) {
    if (backend == NULL) {
        return "NULL";
    }
    return backend->iface.get_name(backend);
}

void ggml_backend_free(ggml_backend_t backend) {
    if (backend == NULL) {
        return;
    }

    backend->iface.free(backend);
}

ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend) {
    return backend->iface.get_default_buffer_type(backend);
}

ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size) {
    return ggml_backend_buft_alloc_buffer(ggml_backend_get_default_buffer_type(backend), size);
}

size_t ggml_backend_get_alignment(ggml_backend_t backend) {
    return ggml_backend_buft_get_alignment(ggml_backend_get_default_buffer_type(backend));
}

void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");

    backend->iface.set_tensor_async(backend, tensor, data, offset, size);
}

void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");

    backend->iface.get_tensor_async(backend, tensor, data, offset, size);
}

void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
    GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set");
    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");

    tensor->buffer->iface.set_tensor(tensor->buffer, tensor, data, offset, size);
}

void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
    GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set");
    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");

    tensor->buffer->iface.get_tensor(tensor->buffer, tensor, data, offset, size);
}

void ggml_backend_synchronize(ggml_backend_t backend) {
    if (backend->iface.synchronize == NULL) {
        return;
    }

    backend->iface.synchronize(backend);
}

ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    return backend->iface.graph_plan_create(backend, cgraph);
}

void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
    backend->iface.graph_plan_free(backend, plan);
}

void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
    backend->iface.graph_plan_compute(backend, plan);

    // TODO: optional sync
    ggml_backend_synchronize(backend);
}

bool ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    if (!backend->iface.graph_compute(backend, cgraph)) {
        return false;
    }

    // TODO: optional sync
    ggml_backend_synchronize(backend);
    return true;
}

bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
    return backend->iface.supports_op(backend, op);
}
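// illustrative usage sketch (not compiled): driving a backend through the generic API above.
// `backend` and `cgraph` are assumed to exist (e.g. from ggml_backend_cpu_init() and a graph built with ggml).
#if 0
static void example_backend_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    // one-shot compute; ggml_backend_graph_compute() synchronizes before returning
    ggml_backend_graph_compute(backend, cgraph);

    // alternatively, plan once and compute the same graph repeatedly
    ggml_backend_graph_plan_t plan = ggml_backend_graph_plan_create(backend, cgraph);
    ggml_backend_graph_plan_compute(backend, plan);
    ggml_backend_graph_plan_free(backend, plan);
}
#endif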
// backend copy

static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
    if (a->type != b->type) {
        return false;
    }
    for (int i = 0; i < GGML_MAX_DIMS; i++) {
        if (a->ne[i] != b->ne[i]) {
            return false;
        }
        if (a->nb[i] != b->nb[i]) {
            return false;
        }
    }
    return true;
}

void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) {
    //printf("src: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", src->name, (int)src->ne[0], (int)src->ne[1], (int)src->ne[2], (int)src->ne[3], (int)src->nb[0], (int)src->nb[1], (int)src->nb[2], (int)src->nb[3]);
    //printf("dst: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", dst->name, (int)dst->ne[0], (int)dst->ne[1], (int)dst->ne[2], (int)dst->ne[3], (int)dst->nb[0], (int)dst->nb[1], (int)dst->nb[2], (int)dst->nb[3]);
    GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");

    // fprintf(stderr, "cpy tensor %s from %s to %s (%lu bytes)\n", src->name, ggml_backend_name(src->backend), ggml_backend_name(dst->backend), ggml_nbytes(src));

    if (src == dst) {
        return;
    }

    // TODO: allow backends to support copy to/from same backend

    if (dst->buffer->iface.cpy_tensor_from != NULL) {
        dst->buffer->iface.cpy_tensor_from(dst->buffer, src, dst);
    } else if (src->buffer->iface.cpy_tensor_to != NULL) {
        src->buffer->iface.cpy_tensor_to(src->buffer, src, dst);
    } else {
        // shouldn't be hit when copying from/to CPU
        #ifndef NDEBUG
        fprintf(stderr, "ggml_backend_tensor_copy: neither cpy_tensor_from nor cpy_tensor_to "
                        "are implemented for %s and %s, falling back to get/set\n", src->name, dst->name);
        #endif
        size_t nbytes = ggml_nbytes(src);
        void * data = malloc(nbytes);
        ggml_backend_tensor_get(src, data, 0, nbytes);
        ggml_backend_tensor_set(dst, data, 0, nbytes);
        free(data);
    }
}
// backend registry

#define GGML_MAX_BACKENDS_REG 16

struct ggml_backend_reg {
    char name[128];
    ggml_backend_init_fn init_fn;
    ggml_backend_buffer_type_t default_buffer_type;
    void * user_data;
};

static struct ggml_backend_reg ggml_backend_registry[GGML_MAX_BACKENDS_REG];
static size_t ggml_backend_registry_count = 0;

static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data);

static void ggml_backend_registry_init(void) {
    static bool initialized = false;

    if (initialized) {
        return;
    }

    initialized = true;

    ggml_backend_register("CPU", ggml_backend_reg_cpu_init, ggml_backend_cpu_buffer_type(), NULL);

    // add forward decls here to avoid including the backend headers
#ifdef GGML_USE_CUBLAS
    extern void ggml_backend_cuda_reg_devices(void);
    ggml_backend_cuda_reg_devices();
#endif

#ifdef GGML_USE_METAL
    extern ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
    extern ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
    ggml_backend_register("Metal", ggml_backend_reg_metal_init, ggml_backend_metal_buffer_type(), NULL);
#endif
}

void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
    GGML_ASSERT(ggml_backend_registry_count < GGML_MAX_BACKENDS_REG);

    size_t id = ggml_backend_registry_count;

    ggml_backend_registry[id] = (struct ggml_backend_reg) {
        /* .name                = */ {0},
        /* .fn                  = */ init_fn,
        /* .default_buffer_type = */ default_buffer_type,
        /* .user_data           = */ user_data,
    };

    snprintf(ggml_backend_registry[id].name, sizeof(ggml_backend_registry[id].name), "%s", name);

#ifndef NDEBUG
    fprintf(stderr, "%s: registered backend %s\n", __func__, name);
#endif

    ggml_backend_registry_count++;
}

size_t ggml_backend_reg_get_count(void) {
    ggml_backend_registry_init();

    return ggml_backend_registry_count;
}

size_t ggml_backend_reg_find_by_name(const char * name) {
    ggml_backend_registry_init();

    for (size_t i = 0; i < ggml_backend_registry_count; i++) {
        // TODO: case insensitive in a portable way
        if (strcmp(ggml_backend_registry[i].name, name) == 0) {
            return i;
        }
    }

    // not found
    return SIZE_MAX;
}

// init from backend:params string
ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str) {
    ggml_backend_registry_init();

    const char * params = strchr(backend_str, ':');
    char backend_name[128];
    if (params == NULL) {
        snprintf(backend_name, sizeof(backend_name), "%s", backend_str);
        params = "";
    } else {
        snprintf(backend_name, sizeof(backend_name), "%.*s", (int)(params - backend_str), backend_str);
        params++;
    }

    size_t backend_i = ggml_backend_reg_find_by_name(backend_name);

    if (backend_i == SIZE_MAX) {
        fprintf(stderr, "%s: backend %s not found\n", __func__, backend_name);
        return NULL;
    }

    return ggml_backend_reg_init_backend(backend_i, params);
}

const char * ggml_backend_reg_get_name(size_t i) {
    ggml_backend_registry_init();

    GGML_ASSERT(i < ggml_backend_registry_count);
    return ggml_backend_registry[i].name;
}

ggml_backend_t ggml_backend_reg_init_backend(size_t i, const char * params) {
    ggml_backend_registry_init();

    GGML_ASSERT(i < ggml_backend_registry_count);
    return ggml_backend_registry[i].init_fn(params, ggml_backend_registry[i].user_data);
}

ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i) {
    ggml_backend_registry_init();

    GGML_ASSERT(i < ggml_backend_registry_count);
    return ggml_backend_registry[i].default_buffer_type;
}

ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size) {
    ggml_backend_registry_init();

    GGML_ASSERT(i < ggml_backend_registry_count);
    return ggml_backend_buft_alloc_buffer(ggml_backend_registry[i].default_buffer_type, size);
}
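// illustrative usage sketch (not compiled): initializing a backend by name through the registry above.
// "CPU" is the name registered in ggml_backend_registry_init(); anything after ':' is passed to the backend as params.
#if 0
static ggml_backend_t example_init_from_registry(void) {
    ggml_backend_t backend = ggml_backend_reg_init_backend_from_str("CPU");
    if (backend == NULL) {
        // the name was not found in the registry
        return NULL;
    }
    return backend;
}
#endif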
// backend CPU

static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
    return (void *)buffer->context;
}

static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    free(buffer->context);
}

static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    memcpy((char *)tensor->data + offset, data, size);

    GGML_UNUSED(buffer);
}

static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
    memcpy(data, (const char *)tensor->data + offset, size);

    GGML_UNUSED(buffer);
}

static void ggml_backend_cpu_buffer_cpy_tensor_from(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) {
    ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));

    GGML_UNUSED(buffer);
}

static void ggml_backend_cpu_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) {
    ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src));

    GGML_UNUSED(buffer);
}

static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
    memset(buffer->context, value, buffer->size);
}

static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
    /* .free_buffer     = */ ggml_backend_cpu_buffer_free_buffer,
    /* .get_base        = */ ggml_backend_cpu_buffer_get_base,
    /* .init_tensor     = */ NULL, // no initialization required
    /* .set_tensor      = */ ggml_backend_cpu_buffer_set_tensor,
    /* .get_tensor      = */ ggml_backend_cpu_buffer_get_tensor,
    /* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
    /* .cpy_tensor_to   = */ ggml_backend_cpu_buffer_cpy_tensor_to,
    /* .clear           = */ ggml_backend_cpu_buffer_clear,
};

// for buffers from ptr, free is not called
static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
    /* .free_buffer     = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
    /* .get_base        = */ ggml_backend_cpu_buffer_get_base,
    /* .init_tensor     = */ NULL, // no initialization required
    /* .set_tensor      = */ ggml_backend_cpu_buffer_set_tensor,
    /* .get_tensor      = */ ggml_backend_cpu_buffer_get_tensor,
    /* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
    /* .cpy_tensor_to   = */ ggml_backend_cpu_buffer_cpy_tensor_to,
    /* .clear           = */ ggml_backend_cpu_buffer_clear,
};

static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512

static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
    size += TENSOR_ALIGNMENT;   // malloc may return an address that is not aligned
    void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC?

    GGML_ASSERT(data != NULL && "failed to allocate buffer");

    return ggml_backend_buffer_init(buft, cpu_backend_buffer_i, data, size);
}

static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
    return TENSOR_ALIGNMENT;

    GGML_UNUSED(buft);
}

static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
    return ggml_backend_is_cpu(backend);

    GGML_UNUSED(buft);
}

static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
    return true;

    GGML_UNUSED(buft);
}

ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
        /* .iface   = */ {
            /* .alloc_buffer     = */ ggml_backend_cpu_buffer_type_alloc_buffer,
            /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
            /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
            /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
            /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
        },
        /* .context = */ NULL,
    };

    return &ggml_backend_cpu_buffer_type;
}

#ifdef GGML_USE_CPU_HBM

// buffer type HBM

#include <hbwmalloc.h>

static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    hbw_free(buffer->context);
}

static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
    //void * ptr = hbw_malloc(size);
    void * ptr;
    int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
    if (result != 0) {
        fprintf(stderr, "failed to allocate HBM buffer of size %zu\n", size);
        return NULL;
    }

    // FIXME: this is a hack to avoid having to implement a new buffer type
    ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
    buffer->buft = buft;
    buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;

    return buffer;
}

ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type() {
    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
        /* .iface   = */ {
            /* .alloc_buffer     = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
            /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
            /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
            /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
            /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
        },
        /* .context = */ NULL,
    };

    return &ggml_backend_cpu_buffer_type_hbm;
}
#endif

struct ggml_backend_cpu_context {
    int n_threads;
    void * work_data;
    size_t work_size;
};

static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
    return "CPU";

    GGML_UNUSED(backend);
}

static void ggml_backend_cpu_free(ggml_backend_t backend) {
    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
    free(cpu_ctx->work_data);
    free(cpu_ctx);
    free(backend);
}

static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) {
    return ggml_backend_cpu_buffer_type();

    GGML_UNUSED(backend);
}

struct ggml_backend_plan_cpu {
    struct ggml_cplan cplan;
    struct ggml_cgraph cgraph;
};

static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;

    struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));

    cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
    cpu_plan->cgraph = *cgraph; // FIXME: deep copy

    if (cpu_plan->cplan.work_size > 0) {
        cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
    }

    return cpu_plan;
}

static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
    struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;

    free(cpu_plan->cplan.work_data);
    free(cpu_plan);

    GGML_UNUSED(backend);
}

static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
    struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;

    ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);

    GGML_UNUSED(backend);
}

static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;

    struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);

    if (cpu_ctx->work_size < cplan.work_size) {
        // TODO: may be faster to free and use malloc to avoid the copy
        cpu_ctx->work_data = realloc(cpu_ctx->work_data, cplan.work_size);
        cpu_ctx->work_size = cplan.work_size;
    }

    cplan.work_data = cpu_ctx->work_data;

    ggml_graph_compute(cgraph, &cplan);
    return true;
}

static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
    switch (op->op) {
        case GGML_OP_MUL_MAT:
            return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
        default:
            return true;
    }

    GGML_UNUSED(backend);
}

static struct ggml_backend_i cpu_backend_i = {
    /* .get_name                = */ ggml_backend_cpu_name,
    /* .free                    = */ ggml_backend_cpu_free,
    /* .get_default_buffer_type = */ ggml_backend_cpu_get_default_buffer_type,
    /* .set_tensor_async        = */ NULL,
    /* .get_tensor_async        = */ NULL,
    /* .cpy_tensor_from_async   = */ NULL,
    /* .cpy_tensor_to_async     = */ NULL,
    /* .synchronize             = */ NULL,
    /* .graph_plan_create       = */ ggml_backend_cpu_graph_plan_create,
    /* .graph_plan_free         = */ ggml_backend_cpu_graph_plan_free,
    /* .graph_plan_compute      = */ ggml_backend_cpu_graph_plan_compute,
    /* .graph_compute           = */ ggml_backend_cpu_graph_compute,
    /* .supports_op             = */ ggml_backend_cpu_supports_op,
};

ggml_backend_t ggml_backend_cpu_init(void) {
    struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context));

    ctx->n_threads = GGML_DEFAULT_N_THREADS;
    ctx->work_data = NULL;
    ctx->work_size = 0;

    ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend));

    *cpu_backend = (struct ggml_backend) {
        /* .interface = */ cpu_backend_i,
        /* .context   = */ ctx
    };
    return cpu_backend;
}

bool ggml_backend_is_cpu(ggml_backend_t backend) {
    return backend->iface.get_name == ggml_backend_cpu_name;
}

void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));

    struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
    ctx->n_threads = n_threads;
}

ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
    return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size);
}

static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data) {
    return ggml_backend_cpu_init();

    GGML_UNUSED(params);
    GGML_UNUSED(user_data);
}
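// illustrative usage sketch (not compiled): running a graph on the CPU backend with the buffer/tensor API above.
// `ctx` is assumed to be a ggml context created with no_alloc=true, `cgraph` a graph built from it, and
// `input`/`src_data`/`dst_data` caller-provided; none of these names come from this file.
#if 0
static void example_cpu_backend_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph,
                                        struct ggml_tensor * input, const void * src_data, void * dst_data) {
    ggml_backend_t backend = ggml_backend_cpu_init();
    ggml_backend_cpu_set_n_threads(backend, 4);

    // place every tensor of the context in a single buffer of the backend's default buffer type
    ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);

    // upload the input, run the graph, read back the result of the last node
    ggml_backend_tensor_set(input, src_data, 0, ggml_nbytes(input));
    ggml_backend_graph_compute(backend, cgraph);

    struct ggml_tensor * result = cgraph->nodes[cgraph->n_nodes - 1];
    ggml_backend_tensor_get(result, dst_data, 0, ggml_nbytes(result));

    ggml_backend_buffer_free(buffer);
    ggml_backend_free(backend);
}
#endif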
// scheduler

#define GGML_MAX_BACKENDS 4
#define GGML_MAX_SPLITS 256
#define GGML_MAX_SPLIT_INPUTS 16

struct ggml_backend_sched_split {
    ggml_tallocr_t tallocr;
    int i_start;
    int i_end;
    struct ggml_tensor * inputs[GGML_MAX_SPLIT_INPUTS];
    int n_inputs;
    struct ggml_cgraph graph;
};

struct ggml_backend_sched {
    int n_backends;
    ggml_backend_t backends[GGML_MAX_BACKENDS];
    ggml_tallocr_t tallocs[GGML_MAX_BACKENDS];

    ggml_gallocr_t galloc;

    struct ggml_hash_set hash_set;
    ggml_tallocr_t * node_talloc;                            // [hash_set.size]
    struct ggml_tensor * (* node_copies)[GGML_MAX_BACKENDS]; // [hash_set.size][GGML_MAX_BACKENDS]

    struct ggml_cgraph * graph;
    struct ggml_backend_sched_split splits[GGML_MAX_SPLITS];
    int n_splits;

    struct ggml_context * ctx;

    // align context_buffer to GGML_MEM_ALIGN
    #ifdef _MSC_VER
    __declspec(align(GGML_MEM_ALIGN))
    #else
    __attribute__((aligned(GGML_MEM_ALIGN)))
    #endif
    char context_buffer[GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
};

#define hash_id(node) ggml_hash_find_or_insert(sched->hash_set, node)
#define node_allocr(node) sched->node_talloc[hash_id(node)]

static bool ggml_is_view_op(enum ggml_op op) {
    return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE;
}

// returns the priority of the backend, lower is better
static int sched_backend_prio(ggml_backend_sched_t sched, ggml_backend_t backend) {
    for (int i = 0; i < sched->n_backends; i++) {
        if (sched->backends[i] == backend) {
            return i;
        }
    }
    return INT_MAX;
}

static int sched_allocr_prio(ggml_backend_sched_t sched, ggml_tallocr_t allocr) {
    for (int i = 0; i < sched->n_backends; i++) {
        if (sched->tallocs[i] == allocr) {
            return i;
        }
    }
    return INT_MAX;
}

static ggml_backend_t get_buffer_backend(ggml_backend_sched_t sched, ggml_backend_buffer_t buffer) {
    if (buffer == NULL) {
        return NULL;
    }
    // find highest prio backend that supports the buffer type
    for (int i = 0; i < sched->n_backends; i++) {
        if (ggml_backend_buft_supports_backend(buffer->buft, sched->backends[i])) {
            return sched->backends[i];
        }
    }
    GGML_ASSERT(false && "tensor buffer type not supported by any backend");
}

static ggml_backend_t get_allocr_backend(ggml_backend_sched_t sched, ggml_tallocr_t allocr) {
    if (allocr == NULL) {
        return NULL;
    }
    // find highest prio backend that supports the buffer type
    for (int i = 0; i < sched->n_backends; i++) {
        if (sched->tallocs[i] == allocr) {
            return sched->backends[i];
        }
    }
    GGML_UNREACHABLE();
}

#if 0
static char causes[GGML_DEFAULT_GRAPH_SIZE*8 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug, remove
#define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
#define GET_CAUSE(node) causes[hash_id(node)]
#else
#define SET_CAUSE(node, ...)
#define GET_CAUSE(node) ""
#endif
// returns the backend that should be used for the node based on the current locations
static ggml_backend_t sched_backend_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * node) {
    // if the dst tensor is already allocated in a buffer, we must assume that it is critical to keep it there
    // ie. kv cache updates
    // note that this doesn't allow fallback to CPU. need to add output tensors to the splits to copy the data back to the original backend.
    // dst
    ggml_backend_t cur_backend = get_buffer_backend(sched, node->buffer);
    if (cur_backend != NULL) {
        SET_CAUSE(node, "1.dst");
        return cur_backend;
    }

    // view_src
    if (node->view_src != NULL && get_buffer_backend(sched, node->view_src->buffer) != NULL) {
        SET_CAUSE(node, "1.vsrc");
        return get_buffer_backend(sched, node->view_src->buffer);
    }

    // src
    int cur_prio = INT_MAX;
    size_t cur_size = 0;

    for (int i = 0; i < GGML_MAX_SRC; i++) {
        const struct ggml_tensor * src = node->src[i];
        if (src == NULL) {
            break;
        }
        ggml_backend_t src_backend = get_buffer_backend(sched, src->buffer);
        if (src_backend != NULL) {
            int src_prio = sched_backend_prio(sched, src_backend);
            size_t src_size = ggml_nbytes(src);
            if (src_prio < cur_prio && src_size >= cur_size) {
                cur_prio = src_prio;
                cur_size = src_size;
                cur_backend = src_backend;
                SET_CAUSE(node, "1.src%d", i);
            }
        }
    }
    return cur_backend;
}

static char * fmt_size(size_t size) {
    static char buffer[128];
    if (size >= 1024*1024) {
        sprintf(buffer, "%zuM", size/1024/1024);
    } else {
        sprintf(buffer, "%zuK", size/1024);
    }
    return buffer;
}

static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
    int cur_split = 0;
    for (int i = 0; i < graph->n_nodes; i++) {
        if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
            ggml_backend_t split_backend = get_allocr_backend(sched, sched->splits[cur_split].tallocr);
            fprintf(stderr, "\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend),
                sched->splits[cur_split].n_inputs);
            for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
                fprintf(stderr, "[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
                    fmt_size(ggml_nbytes(sched->splits[cur_split].inputs[j])));
            }
            fprintf(stderr, "\n");
            cur_split++;
        }
        struct ggml_tensor * node = graph->nodes[i];
        if (ggml_is_view_op(node->op)) {
            continue;
        }
        ggml_tallocr_t node_allocr = node_allocr(node);
        ggml_backend_t node_backend = node_allocr ? get_allocr_backend(sched, node_allocr) : NULL; // FIXME:
        fprintf(stderr, "node #%3d (%10.10s): %20.20s (%4.4s) [%4.4s %8.8s]:", i, ggml_op_name(node->op), node->name,
            fmt_size(ggml_nbytes(node)), node_allocr ? ggml_backend_name(node_backend) : "NULL", GET_CAUSE(node));
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * src = node->src[j];
            if (src == NULL) {
                break;
            }
            ggml_tallocr_t src_allocr = node_allocr(src);
            ggml_backend_t src_backend = src_allocr ? get_allocr_backend(sched, src_allocr) : NULL;
            fprintf(stderr, " %20.20s (%4.4s) [%4.4s %8.8s]", src->name,
                fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
        }
        fprintf(stderr, "\n");
    }
}

// creates a copy of the tensor with the same memory layout
static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, const struct ggml_tensor * tensor) {
    struct ggml_tensor * dup = ggml_dup_tensor(ctx, tensor);
    for (int i = 0; i < GGML_MAX_DIMS; i++) {
        dup->nb[i] = tensor->nb[i];
    }
    return dup;
}
// assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
// TODO: merge passes
static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
    // reset state
    size_t hash_size = sched->hash_set.size;
    memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size);
    memset(sched->node_talloc,   0, sizeof(sched->node_talloc[0])   * hash_size);
    memset(sched->node_copies,   0, sizeof(sched->node_copies[0])   * hash_size);
    sched->n_splits = 0;

    struct ggml_init_params params = {
        /* .mem_size   = */ sizeof(sched->context_buffer),
        /* .mem_buffer = */ sched->context_buffer,
        /* .no_alloc   = */ true
    };

    if (sched->ctx != NULL) {
        ggml_free(sched->ctx);
    }

    sched->ctx = ggml_init(params);

    // pass 1: assign backends to ops with allocated inputs
    for (int i = 0; i < graph->n_leafs; i++) {
        struct ggml_tensor * leaf = graph->leafs[i];
        if (node_allocr(leaf) != NULL) {
            // do not overwrite user assignments
            continue;
        }
        ggml_backend_t leaf_backend = get_buffer_backend(sched, leaf->buffer);
        if (leaf_backend == NULL && leaf->view_src != NULL) {
            leaf_backend = get_buffer_backend(sched, leaf->view_src->buffer);
        }
        if (leaf_backend != NULL) {
            node_allocr(leaf) = ggml_backend_sched_get_tallocr(sched, leaf_backend);
        }
    }

    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];
        if (node_allocr(node) != NULL) {
            // do not overwrite user assignments
            continue;
        }
        ggml_backend_t node_backend = sched_backend_from_cur(sched, node);
        if (node_backend != NULL) {
            node_allocr(node) = ggml_backend_sched_get_tallocr(sched, node_backend);
        }
    }
    //printf("PASS 1 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);

    // pass 2: assign backends to ops from current assignments
    // TODO:
    //  - reuse sched_backend_from_cur
    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];
        ggml_tallocr_t node_allocr = node_allocr(node);
        if (node_allocr == NULL) {
            int    cur_prio = INT_MAX;
            size_t cur_size = 0;
            for (int j = 0; j < GGML_MAX_SRC; j++) {
                struct ggml_tensor * src = node->src[j];
                if (src == NULL) {
                    break;
                }
                ggml_tallocr_t src_allocr = node_allocr(src);
                if (src_allocr != NULL) {
                    int    src_prio = sched_allocr_prio(sched, src_allocr);
                    size_t src_size = ggml_nbytes(src);
                    if (src_prio < cur_prio && src_size >= cur_size) {
                        cur_prio = src_prio;
                        cur_size = src_size;
                        node_allocr = src_allocr;
                        SET_CAUSE(node, "2.src%d", j);
                    }
                }
            }
            if (node_allocr != NULL) {
                node_allocr(node) = node_allocr;
            }
        }
    }
    //printf("PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);

    // pass 3: assign backends to remaining src from dst (should only be leafs)
    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];
        ggml_tallocr_t node_allocr = node_allocr(node);
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * src = node->src[j];
            if (src == NULL) {
                break;
            }
            ggml_tallocr_t src_allocr = node_allocr(src);
            if (src_allocr == NULL) {
                node_allocr(src) = node_allocr;
            }
        }
    }
    //printf("PASS 3 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);

    // pass 4: split graph, find tensors that need to be copied
    // TODO:
    //  - when switching from a less preferred backend to a more preferred backend, check if it is possible to move the switch to an earlier point for the same cost
    // find first backend
    int cur_split = 0;
    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];
        if (node->view_src == NULL) {
            sched->splits[0].tallocr = node_allocr(node);
            break;
        }
    }
    sched->splits[0].i_start = 0;
    sched->splits[0].n_inputs = 0;
    memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK
    ggml_tallocr_t cur_allocr = sched->splits[0].tallocr;
    size_t cur_backend_id = sched_allocr_prio(sched, cur_allocr);
    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];

        if (ggml_is_view_op(node->op)) {
            continue;
        }

        ggml_tallocr_t node_allocr = node_allocr(node);

        if (node_allocr != cur_allocr) {
            sched->splits[cur_split].i_end = i;
            cur_split++;
            GGML_ASSERT(cur_split < GGML_MAX_SPLITS);
            sched->splits[cur_split].tallocr = node_allocr;
            sched->splits[cur_split].i_start = i;
            sched->splits[cur_split].n_inputs = 0;
            memset(sched->splits[cur_split].inputs, 0, sizeof(sched->splits[cur_split].inputs)); //HACK
            cur_allocr = node_allocr;
            cur_backend_id = sched_allocr_prio(sched, cur_allocr);
        }

        // find inputs that are not on the same backend
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * src = node->src[j];
            if (src == NULL) {
                break;
            }
            ggml_tallocr_t src_allocr = node_allocr(src);
            if (src_allocr != node_allocr) {
                int n_inputs = sched->splits[cur_split].n_inputs++;
                GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
                sched->splits[cur_split].inputs[n_inputs] = (struct ggml_tensor *)src;

                // create copies
                size_t id = hash_id(src);
                if (sched->node_copies[id][cur_backend_id] == NULL) {
                    struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
                    sched->node_copies[id][cur_backend_id] = tensor_copy;
                    node_allocr(tensor_copy) = cur_allocr;
                    ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
                    ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
                }
                node->src[j] = sched->node_copies[id][cur_backend_id];
            }
        }
    }
    sched->splits[cur_split].i_end = graph->n_nodes;
    sched->n_splits = cur_split + 1;

    //fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); fflush(stdout);

#if 1
    // sanity check: all sources should have the same backend as the node
    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];
        ggml_tallocr_t node_allocr = node_allocr(node);
        if (node_allocr == NULL) {
            fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
        }
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * src = node->src[j];
            if (src == NULL) {
                break;
            }
            ggml_tallocr_t src_allocr = node_allocr(src);
            if (src_allocr != node_allocr /* && src_backend != NULL */) { // ignore nulls for now
                fprintf(stderr, "!!!! %s has backend %s, src %d (%s) has backend %s\n",
                    node->name, node_allocr ? ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL",
                    j, src->name, src_allocr ? ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL");
            }
        }
    }
#endif

    // create copies of the graph for each split
    // FIXME: avoid this copy, pass split inputs to ggml_gallocr_alloc_graph_n in some other way
    struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_MAX_SPLIT_INPUTS, false);
    for (int i = 0; i < sched->n_splits; i++) {
        struct ggml_backend_sched_split * split = &sched->splits[i];
        split->graph = ggml_graph_view(graph, split->i_start, split->i_end);

        // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
        for (int j = 0; j < split->n_inputs; j++) {
            struct ggml_tensor * input = split->inputs[j];
            struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_allocr_prio(sched, split->tallocr)];
            input_cpy->src[0] = input;
            graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
        }

        for (int j = split->i_start; j < split->i_end; j++) {
            graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
        }
    }
    sched->graph = graph_copy;
}
static void sched_alloc_splits(ggml_backend_sched_t sched) {
    ggml_gallocr_alloc_graph_n(
        sched->galloc,
        sched->graph,
        sched->hash_set,
        sched->node_talloc);
}

static void sched_compute_splits(ggml_backend_sched_t sched) {
    uint64_t copy_us[GGML_MAX_BACKENDS] = {0};
    uint64_t compute_us[GGML_MAX_BACKENDS] = {0};

    struct ggml_backend_sched_split * splits = sched->splits;

    for (int i = 0; i < sched->n_splits; i++) {
        struct ggml_backend_sched_split * split = &splits[i];
        ggml_backend_t split_backend = get_allocr_backend(sched, split->tallocr);
        int split_backend_id = sched_backend_prio(sched, split_backend);

        // copy the input tensors to the split backend
        uint64_t copy_start_us = ggml_time_us();
        for (int j = 0; j < split->n_inputs; j++) {
            struct ggml_tensor * input = split->inputs[j];
            struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_backend_prio(sched, split_backend)];
            if (input->buffer == NULL) {
                if (input->view_src == NULL) {
                    fprintf(stderr, "input %s has no buffer and no view_src\n", input->name);
                    exit(1);
                }
                // FIXME: may need to use the sched buffer instead
                ggml_backend_view_init(input->view_src->buffer, input);
            }
            if (input_cpy->buffer == NULL) {
                fprintf(stderr, "input_cpy %s has no buffer\n", input_cpy->name);
                exit(1);
            }
            //GGML_ASSERT(input->buffer->backend != input_cpy->buffer->backend);
            //GGML_ASSERT(input_cpy->buffer->backend == split_backend);
            ggml_backend_tensor_copy(input, input_cpy);
        }
        // ggml_backend_synchronize(split_backend);
        int64_t copy_end_us = ggml_time_us();
        copy_us[split_backend_id] += copy_end_us - copy_start_us;

#if 0
        char split_filename[GGML_MAX_NAME];
        snprintf(split_filename, GGML_MAX_NAME, "split_%i_%s.dot", i, ggml_backend_name(split_backend));
        ggml_graph_dump_dot(split->graph, NULL, split_filename);
#endif

        uint64_t compute_start_us = ggml_time_us();
        ggml_backend_graph_compute(split_backend, &split->graph);
        // ggml_backend_synchronize(split_backend);
        uint64_t compute_end_us = ggml_time_us();
        compute_us[split_backend_id] += compute_end_us - compute_start_us;
    }

#if 0
    // per-backend timings
    fprintf(stderr, "sched_compute_splits times (%d splits):\n", sched->n_splits);
    for (int i = 0; i < sched->n_backends; i++) {
        if (copy_us[i] > 0 || compute_us[i] > 0) {
            fprintf(stderr, "\t%5.5s: %lu us copy, %lu us compute\n", ggml_backend_name(sched->backends[i]), copy_us[i], compute_us[i]);
        }
    }
#endif
}
static void sched_reset(ggml_backend_sched_t sched) {
    for (int i = 0; i < sched->n_backends; i++) {
        ggml_tallocr_reset(sched->tallocs[i]);
    }
}

ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends) {
    GGML_ASSERT(n_backends <= GGML_MAX_BACKENDS);

    struct ggml_backend_sched * sched = malloc(sizeof(struct ggml_backend_sched));
    memset(sched, 0, sizeof(struct ggml_backend_sched));

    sched->n_backends = n_backends;
    for (int i = 0; i < n_backends; i++) {
        sched->backends[i] = backends[i];
    }

    sched->galloc = ggml_gallocr_new();

    // init measure allocs for each backend
    for (int i = 0; i < n_backends; i++) {
        sched->tallocs[i] = ggml_tallocr_new_measure_from_backend(backends[i]);
    }

    return sched;
}

void ggml_backend_sched_free(ggml_backend_sched_t sched) {
    if (sched == NULL) {
        return;
    }
    for (int i = 0; i < sched->n_backends; i++) {
        ggml_tallocr_free(sched->tallocs[i]);
    }
    ggml_gallocr_free(sched->galloc);
    free(sched->hash_set.keys);
    free(sched->node_talloc);
    free(sched->node_copies);
    free(sched);
}

void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
    // initialize hash tables
    size_t hash_size = measure_graph->visited_hash_table.size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS;
    sched->hash_set.size = hash_size;
    sched->hash_set.keys = malloc(sizeof(sched->hash_set.keys[0]) * hash_size);
    sched->node_talloc   = malloc(sizeof(sched->node_talloc[0])   * hash_size);
    sched->node_copies   = malloc(sizeof(sched->node_copies[0])   * hash_size);

    sched_split_graph(sched, measure_graph);
    sched_alloc_splits(sched);

    // allocate buffers and reset allocators
    for (int i = 0; i < sched->n_backends; i++) {
        size_t size = ggml_tallocr_max_size(sched->tallocs[i]);
        ggml_tallocr_free(sched->tallocs[i]);
        sched->tallocs[i] = ggml_tallocr_new_from_backend(sched->backends[i], size);
    }

    sched_reset(sched);
}

void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
    GGML_ASSERT(sched->hash_set.size >= graph->visited_hash_table.size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);

    sched_split_graph(sched, graph);
    sched_alloc_splits(sched);
    sched_compute_splits(sched);
    sched_reset(sched);
}

ggml_tallocr_t ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend) {
    int backend_index = sched_backend_prio(sched, backend);
    return sched->tallocs[backend_index];
}

ggml_backend_buffer_t ggml_backend_sched_get_buffer(ggml_backend_sched_t sched, ggml_backend_t backend) {
    int backend_index = sched_backend_prio(sched, backend);
    return ggml_tallocr_get_buffer(sched->tallocs[backend_index]);
}

void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
    int backend_index = sched_backend_prio(sched, backend);
    GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
    node_allocr(node) = sched->tallocs[backend_index];
}
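// illustrative usage sketch (not compiled): splitting a graph across two backends with the scheduler above.
// `backend_gpu` stands for any non-CPU backend and `measure_graph`/`graph` for caller-built graphs of the same
// topology; these names are assumptions of this sketch, not part of this file.
#if 0
static void example_sched_compute(ggml_backend_t backend_gpu, struct ggml_cgraph * measure_graph, struct ggml_cgraph * graph) {
    ggml_backend_t backend_cpu = ggml_backend_cpu_init();

    // order defines priority: the first backend is preferred when a node could run on either
    ggml_backend_t backends[2] = { backend_gpu, backend_cpu };
    ggml_backend_sched_t sched = ggml_backend_sched_new(backends, 2);

    // reserve buffers on each backend using a worst-case graph
    ggml_backend_sched_init_measure(sched, measure_graph);

    // split, allocate and compute; can be called repeatedly for graphs of the same topology
    ggml_backend_sched_graph_compute(sched, graph);

    ggml_backend_sched_free(sched);
    ggml_backend_free(backend_cpu);
}
#endif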
// utils

void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
    GGML_ASSERT(tensor->buffer == NULL);
    //GGML_ASSERT(tensor->data == NULL); // views of pre-allocated tensors may have the data set, but still need to be initialized
    GGML_ASSERT(tensor->view_src != NULL);
    GGML_ASSERT(tensor->view_src->buffer != NULL);
    GGML_ASSERT(tensor->view_src->data != NULL);

    tensor->buffer = buffer;
    tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
    tensor->backend = tensor->view_src->backend;
    ggml_backend_buffer_init_tensor(buffer, tensor);
}

void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) {
    GGML_ASSERT(tensor->buffer == NULL);
    GGML_ASSERT(tensor->data == NULL);
    GGML_ASSERT(tensor->view_src == NULL);
    GGML_ASSERT(addr >= ggml_backend_buffer_get_base(buffer));
    GGML_ASSERT((char *)addr + ggml_backend_buffer_get_alloc_size(buffer, tensor) <=
                (char *)ggml_backend_buffer_get_base(buffer) + ggml_backend_buffer_get_size(buffer));

    tensor->buffer = buffer;
    tensor->data = addr;
    ggml_backend_buffer_init_tensor(buffer, tensor);
}
static struct ggml_tensor * graph_dup_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies,
    struct ggml_context * ctx_allocated, struct ggml_context * ctx_unallocated, struct ggml_tensor * src) {

    GGML_ASSERT(src != NULL);
    GGML_ASSERT(src->data && "graph must be allocated");

    size_t id = ggml_hash_insert(hash_set, src);
    if (id == GGML_HASHTABLE_ALREADY_EXISTS) {
        return node_copies[ggml_hash_find(hash_set, src)];
    }

    struct ggml_tensor * dst = ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src);
    if (src->view_src != NULL) {
        dst->view_src = graph_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src);
        dst->view_offs = src->view_offs;
    }
    dst->op = src->op;
    memcpy(dst->op_params, src->op_params, sizeof(dst->op_params));
    ggml_set_name(dst, src->name);

    // copy src
    for (int i = 0; i < GGML_MAX_SRC; i++) {
        struct ggml_tensor * s = src->src[i];
        if (s == NULL) {
            break;
        }
        dst->src[i] = graph_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s);
    }

    node_copies[id] = dst;
    return dst;
}

static void graph_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) {
    size_t id = ggml_hash_find(hash_set, src);
    if (node_init[id]) {
        return;
    }
    node_init[id] = true;

    struct ggml_tensor * dst = node_copies[id];
    if (dst->view_src != NULL) {
        ggml_backend_view_init(dst->view_src->buffer, dst);
    }
    else {
        ggml_backend_tensor_copy(src, dst);
    }

    // init src
    for (int i = 0; i < GGML_MAX_SRC; i++) {
        struct ggml_tensor * s = src->src[i];
        if (s == NULL) {
            break;
        }
        graph_init_tensor(hash_set, node_copies, node_init, s);
    }
}
struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
    struct ggml_hash_set hash_set = {
        /* .size = */ graph->visited_hash_table.size,
        /* .keys = */ calloc(sizeof(hash_set.keys[0]) * graph->visited_hash_table.size, 1)
    };
    struct ggml_tensor ** node_copies = calloc(sizeof(node_copies[0]) * hash_set.size, 1);
    bool * node_init = calloc(sizeof(node_init[0]) * hash_set.size, 1);

    struct ggml_init_params params = {
        /* .mem_size   = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false),
        /* .mem_buffer = */ NULL,
        /* .no_alloc   = */ true
    };

    struct ggml_context * ctx_allocated = ggml_init(params);
    struct ggml_context * ctx_unallocated = ggml_init(params);

    // dup nodes
    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];
        graph_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, node);
    }

    // allocate nodes
    ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);

    //printf("copy buffer size: %zu MB\n", ggml_backend_buffer_get_size(buffer) / 1024 / 1024);

    // copy data and init views
    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];
        graph_init_tensor(hash_set, node_copies, node_init, node);
    }

    // build graph copy
    struct ggml_cgraph * graph_copy = ggml_new_graph_custom(ctx_allocated, graph->size, false);
    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];
        struct ggml_tensor * node_copy = node_copies[ggml_hash_find(hash_set, node)];
        graph_copy->nodes[i] = node_copy;
    }
    graph_copy->n_nodes = graph->n_nodes;

    free(hash_set.keys);
    free(node_copies);
    free(node_init);

    return (struct ggml_backend_graph_copy) {
        /* .buffer          = */ buffer,
        /* .ctx_allocated   = */ ctx_allocated,
        /* .ctx_unallocated = */ ctx_unallocated,
        /* .graph           = */ graph_copy,
    };
}

void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy) {
    ggml_backend_buffer_free(copy.buffer);
    ggml_free(copy.ctx_allocated);
    ggml_free(copy.ctx_unallocated);
}
void ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data) {
    struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend2, graph);
    struct ggml_cgraph * g1 = graph;
    struct ggml_cgraph * g2 = copy.graph;

    assert(g1->n_nodes == g2->n_nodes);

    for (int i = 0; i < g1->n_nodes; i++) {
        //printf("eval %d/%d\n", i, g1->n_nodes);
        struct ggml_tensor * t1 = g1->nodes[i];
        struct ggml_tensor * t2 = g2->nodes[i];

        assert(t1->op == t2->op && ggml_are_same_layout(t1, t2));

        struct ggml_cgraph g1v = ggml_graph_view(g1, i, i + 1);
        struct ggml_cgraph g2v = ggml_graph_view(g2, i, i + 1);

        ggml_backend_graph_compute(backend1, &g1v);
        ggml_backend_graph_compute(backend2, &g2v);

        if (ggml_is_view_op(t1->op)) {
            continue;
        }

        // compare results, calculate rms etc
        if (!callback(i, t1, t2, user_data)) {
            break;
        }
    }

    ggml_backend_graph_copy_free(copy);
}
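// illustrative usage sketch (not compiled): a per-node callback for ggml_backend_compare_graph_backend above.
// `example_eval_callback` and the comparison strategy are assumptions of the sketch; a real callback would read
// both tensors back with ggml_backend_tensor_get() and compute an error metric before deciding whether to continue.
#if 0
static bool example_eval_callback(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data) {
    // t1 comes from backend1 (the original graph), t2 from backend2 (the copy); same op and layout at this point
    (void) node_index; (void) t1; (void) t2; (void) user_data;
    return true; // return false to stop the comparison early
}

// usage:
//   ggml_backend_compare_graph_backend(backend1, backend2, graph, example_eval_callback, NULL);
#endif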