// ggml-backend.cpp
  1. // Note: porting this file to C++ is a work in progress
  2. #ifdef _WIN32
  3. #define WIN32_LEAN_AND_MEAN
  4. #ifndef NOMINMAX
  5. # define NOMINMAX
  6. #endif
  7. #include <windows.h>
  8. #endif
  9. #include "ggml-backend.h"
  10. #include "ggml-backend-impl.h"
  11. #include "ggml-alloc.h"
  12. #include "ggml-impl.h"
  13. #include <assert.h>
  14. #include <limits.h>
  15. #include <stdarg.h>
  16. #include <stdio.h>
  17. #include <stdlib.h>
  18. #include <string.h>
  19. #include <algorithm>
  20. #include <vector>
  21. #ifdef __APPLE__
  22. #include <sys/types.h>
  23. #include <sys/sysctl.h>
  24. #endif
  25. // backend buffer type
  26. const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
  27. GGML_ASSERT(buft);
  28. return buft->iface.get_name(buft);
  29. }
  30. ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
  31. if (size == 0) {
  32. // return a dummy buffer for zero-sized allocations
  33. return ggml_backend_buffer_init(buft, {}, NULL, 0);
  34. }
  35. GGML_ASSERT(buft);
  36. return buft->iface.alloc_buffer(buft, size);
  37. }
  38. size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
  39. GGML_ASSERT(buft);
  40. return buft->iface.get_alignment(buft);
  41. }
  42. size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
  43. GGML_ASSERT(buft);
  44. // get_max_size is optional, defaults to SIZE_MAX
  45. if (buft->iface.get_max_size) {
  46. return buft->iface.get_max_size(buft);
  47. }
  48. return SIZE_MAX;
  49. }
  50. size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor) {
  51. GGML_ASSERT(buft);
  52. // get_alloc_size is optional, defaults to ggml_nbytes
  53. if (buft->iface.get_alloc_size) {
  54. size_t size = buft->iface.get_alloc_size(buft, tensor);
  55. assert(size >= ggml_nbytes(tensor));
  56. return size;
  57. }
  58. return ggml_nbytes(tensor);
  59. }
  60. bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
  61. GGML_ASSERT(buft);
  62. if (buft->iface.is_host) {
  63. return buft->iface.is_host(buft);
  64. }
  65. return false;
  66. }
  67. ggml_backend_dev_t ggml_backend_buft_get_device(ggml_backend_buffer_type_t buft) {
  68. GGML_ASSERT(buft);
  69. return buft->device;
  70. }
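// Illustrative usage sketch (not part of the upstream file): allocating a buffer through a
// buffer type and querying its properties. Assumes `buft` was obtained elsewhere, e.g. from
// ggml_backend_dev_buffer_type().
#if 0
static void example_buft_usage(ggml_backend_buffer_type_t buft) {
    const size_t align    = ggml_backend_buft_get_alignment(buft); // allocations are aligned to at least this
    const size_t max_size = ggml_backend_buft_get_max_size(buft);  // SIZE_MAX unless the backend reports a limit
    size_t size = 16u*1024*1024;
    if (size > max_size) {
        size = max_size; // larger allocations have to be split across multiple buffers
    }
    (void) align;
    ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, size);
    ggml_backend_buffer_free(buf);
}
#endif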
  71. // backend buffer
  72. ggml_backend_buffer_t ggml_backend_buffer_init(
  73. ggml_backend_buffer_type_t buft,
  74. struct ggml_backend_buffer_i iface,
  75. void * context,
  76. size_t size) {
  77. ggml_backend_buffer_t buffer = new ggml_backend_buffer {
  78. /* .interface = */ iface,
  79. /* .buft = */ buft,
  80. /* .context = */ context,
  81. /* .size = */ size,
  82. /* .usage = */ GGML_BACKEND_BUFFER_USAGE_ANY
  83. };
  84. return buffer;
  85. }
  86. const char * ggml_backend_buffer_name(ggml_backend_buffer_t buffer) {
  87. return ggml_backend_buft_name(ggml_backend_buffer_get_type(buffer));
  88. }
  89. void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
  90. if (buffer == NULL) {
  91. return;
  92. }
  93. if (buffer->iface.free_buffer != NULL) {
  94. buffer->iface.free_buffer(buffer);
  95. }
  96. delete buffer;
  97. }
  98. size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
  99. GGML_ASSERT(buffer);
  100. return buffer->size;
  101. }
  102. void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
  103. GGML_ASSERT(buffer);
  104. // get_base is optional if the buffer is zero-sized
  105. if (buffer->size == 0) {
  106. return NULL;
  107. }
  108. void * base = buffer->iface.get_base(buffer);
  109. GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
  110. return base;
  111. }
  112. enum ggml_status ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
  113. GGML_ASSERT(buffer);
  114. // init_tensor is optional
  115. if (buffer->iface.init_tensor) {
  116. return buffer->iface.init_tensor(buffer, tensor);
  117. }
  118. return GGML_STATUS_SUCCESS;
  119. }
  120. void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
  121. GGML_ASSERT(buffer);
  122. // clear is optional if the buffer is zero-sized
  123. if (buffer->size == 0) {
  124. return;
  125. }
  126. buffer->iface.clear(buffer, value);
  127. }
  128. size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer) {
  129. return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
  130. }
  131. size_t ggml_backend_buffer_get_max_size(ggml_backend_buffer_t buffer) {
  132. return ggml_backend_buft_get_max_size(ggml_backend_buffer_get_type(buffer));
  133. }
  134. size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor) {
  135. return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
  136. }
  137. bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
  138. return ggml_backend_buft_is_host(ggml_backend_buffer_get_type(buffer));
  139. }
  140. void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
  141. GGML_ASSERT(buffer);
  142. buffer->usage = usage;
  143. // FIXME: add a generic callback to the buffer interface
  144. if (ggml_backend_buffer_is_multi_buffer(buffer)) {
  145. ggml_backend_multi_buffer_set_usage(buffer, usage);
  146. }
  147. }
  148. enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage(ggml_backend_buffer_t buffer) {
  149. GGML_ASSERT(buffer);
  150. return buffer->usage;
  151. }
  152. ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
  153. GGML_ASSERT(buffer);
  154. return buffer->buft;
  155. }
  156. void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) {
  157. GGML_ASSERT(buffer);
  158. if (buffer->iface.reset) {
  159. buffer->iface.reset(buffer);
  160. }
  161. }
  162. bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst) {
  163. ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer;
  164. if (dst_buf->iface.cpy_tensor) {
  165. return dst_buf->iface.cpy_tensor(dst_buf, src, dst);
  166. }
  167. return false;
  168. }
  169. // backend
  170. ggml_guid_t ggml_backend_guid(ggml_backend_t backend) {
  171. if (backend == NULL) {
  172. return NULL;
  173. }
  174. return backend->guid;
  175. }
  176. const char * ggml_backend_name(ggml_backend_t backend) {
  177. if (backend == NULL) {
  178. return "NULL";
  179. }
  180. return backend->iface.get_name(backend);
  181. }
  182. void ggml_backend_free(ggml_backend_t backend) {
  183. if (backend == NULL) {
  184. return;
  185. }
  186. backend->iface.free(backend);
  187. }
  188. ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend) {
  189. GGML_ASSERT(backend);
  190. return ggml_backend_dev_buffer_type(backend->device);
  191. }
  192. ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size) {
  193. return ggml_backend_buft_alloc_buffer(ggml_backend_get_default_buffer_type(backend), size);
  194. }
  195. size_t ggml_backend_get_alignment(ggml_backend_t backend) {
  196. return ggml_backend_buft_get_alignment(ggml_backend_get_default_buffer_type(backend));
  197. }
  198. size_t ggml_backend_get_max_size(ggml_backend_t backend) {
  199. return ggml_backend_buft_get_max_size(ggml_backend_get_default_buffer_type(backend));
  200. }
  201. void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
  202. GGML_ASSERT(backend);
  203. GGML_ASSERT(tensor);
  204. GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
  205. GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
  206. if (backend->iface.set_tensor_async == NULL) {
  207. ggml_backend_tensor_set(tensor, data, offset, size);
  208. } else {
  209. backend->iface.set_tensor_async(backend, tensor, data, offset, size);
  210. }
  211. }
  212. void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
  213. GGML_ASSERT(backend);
  214. GGML_ASSERT(tensor);
  215. GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
  216. GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
  217. if (backend->iface.get_tensor_async == NULL) {
  218. ggml_backend_tensor_get(tensor, data, offset, size);
  219. } else {
  220. backend->iface.get_tensor_async(backend, tensor, data, offset, size);
  221. }
  222. }
  223. void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
  224. GGML_ASSERT(tensor);
  225. ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
  226. if (size == 0) {
  227. return;
  228. }
  229. GGML_ASSERT(buf != NULL && "tensor buffer not set");
  230. GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
  231. GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
  232. buf->iface.set_tensor(buf, tensor, data, offset, size);
  233. }
  234. void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
  235. GGML_ASSERT(tensor);
  236. ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
  237. if (size == 0) {
  238. return;
  239. }
  240. GGML_ASSERT(buf != NULL && "tensor buffer not set");
  241. GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
  242. GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
  243. buf->iface.get_tensor(buf, tensor, data, offset, size);
  244. }
  245. void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
  246. GGML_ASSERT(tensor);
  247. ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
  248. if (size == 0) {
  249. return;
  250. }
  251. GGML_ASSERT(buf != NULL && "tensor buffer not set");
  252. GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
  253. GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
  254. GGML_ASSERT(buf->iface.memset_tensor != NULL && "memset not implemented by backend buffer");
  255. buf->iface.memset_tensor(buf, tensor, value, offset, size);
  256. }
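// Illustrative sketch (not part of the upstream file): round-tripping data through a backend
// tensor with the helpers above. Assumes `t` is already allocated in a backend buffer.
#if 0
static void example_tensor_io(struct ggml_tensor * t) {
    std::vector<uint8_t> host(ggml_nbytes(t), 0);
    ggml_backend_tensor_set(t, host.data(), 0, host.size()); // host -> backend
    ggml_backend_tensor_get(t, host.data(), 0, host.size()); // backend -> host
    ggml_backend_tensor_memset(t, 0, 0, ggml_nbytes(t));     // only valid if the buffer implements memset_tensor
}
#endif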
  257. void ggml_backend_synchronize(ggml_backend_t backend) {
  258. GGML_ASSERT(backend);
  259. if (backend->iface.synchronize == NULL) {
  260. return;
  261. }
  262. backend->iface.synchronize(backend);
  263. }
  264. ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
  265. GGML_ASSERT(backend);
  266. GGML_ASSERT(backend->iface.graph_plan_create != NULL);
  267. return backend->iface.graph_plan_create(backend, cgraph);
  268. }
  269. void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
  270. GGML_ASSERT(backend);
  271. GGML_ASSERT(backend->iface.graph_plan_free != NULL);
  272. backend->iface.graph_plan_free(backend, plan);
  273. }
  274. enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
  275. GGML_ASSERT(backend);
  276. GGML_ASSERT(backend->iface.graph_plan_compute != NULL);
  277. return backend->iface.graph_plan_compute(backend, plan);
  278. }
  279. enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
  280. enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph);
  281. ggml_backend_synchronize(backend);
  282. return err;
  283. }
  284. enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
  285. GGML_ASSERT(backend);
  286. return backend->iface.graph_compute(backend, cgraph);
  287. }
  288. bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
  289. GGML_ASSERT(backend);
  290. return ggml_backend_dev_supports_op(backend->device, op);
  291. }
  292. bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
  293. GGML_ASSERT(backend);
  294. return ggml_backend_dev_supports_buft(backend->device, buft);
  295. }
  296. bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
  297. GGML_ASSERT(backend);
  298. return ggml_backend_dev_offload_op(backend->device, op);
  299. }
  300. ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend) {
  301. GGML_ASSERT(backend);
  302. return backend->device;
  303. }
  304. // backend copy
  305. void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) {
  306. GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
  307. if (src == dst) {
  308. return;
  309. }
  310. if (ggml_backend_buffer_is_host(src->buffer)) {
  311. ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src));
  312. } else if (ggml_backend_buffer_is_host(dst->buffer)) {
  313. ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
  314. } else if (!ggml_backend_buffer_copy_tensor(src, dst)) {
  315. #ifndef NDEBUG
  316. GGML_LOG_DEBUG("%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer));
  317. #endif
  318. size_t nbytes = ggml_nbytes(src);
  319. void * data = malloc(nbytes);
  320. ggml_backend_tensor_get(src, data, 0, nbytes);
  321. ggml_backend_tensor_set(dst, data, 0, nbytes);
  322. free(data);
  323. }
  324. }
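// (comment added for clarity) copy strategy above: if either side lives in host memory the data
// is moved with a single set/get, otherwise the destination buffer's cpy_tensor hook is tried,
// and as a last resort the data is bounced through a temporary host allocation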
  325. void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst) {
  326. GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
  327. if (src == dst) {
  328. return;
  329. }
  330. GGML_ASSERT(backend_dst);
  331. if (backend_dst->iface.cpy_tensor_async != NULL) {
  332. if (backend_dst->iface.cpy_tensor_async(backend_src, backend_dst, src, dst)) {
  333. return;
  334. }
  335. }
  336. // an async copy would normally happen after all the queued operations on both backends are completed
  337. // to simulate the same behavior, we need to synchronize both backends first, and do a blocking copy
  338. ggml_backend_synchronize(backend_src);
  339. ggml_backend_synchronize(backend_dst);
  340. ggml_backend_tensor_copy(src, dst);
  341. }
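// Illustrative sketch (not part of the upstream file): copying a tensor between two backends
// without blocking longer than necessary. Assumes src and dst have the same layout and are
// both allocated.
#if 0
static void example_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst,
                               struct ggml_tensor * src, struct ggml_tensor * dst) {
    ggml_backend_tensor_copy_async(backend_src, backend_dst, src, dst);
    // the copy is only guaranteed to be complete after the destination backend is synchronized
    ggml_backend_synchronize(backend_dst);
}
#endif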
  342. // events
  343. ggml_backend_event_t ggml_backend_event_new(ggml_backend_dev_t device) {
  344. // a null device is allowed during the transition period to the device interface
  345. if (device == NULL || device->iface.event_new == NULL) {
  346. return NULL;
  347. }
  348. return device->iface.event_new(device);
  349. }
  350. void ggml_backend_event_free(ggml_backend_event_t event) {
  351. if (event == NULL) {
  352. return;
  353. }
  354. event->device->iface.event_free(event->device, event);
  355. }
  356. void ggml_backend_event_record(ggml_backend_event_t event, ggml_backend_t backend) {
  357. GGML_ASSERT(backend);
  358. GGML_ASSERT(backend->iface.event_record != NULL);
  359. backend->iface.event_record(backend, event);
  360. }
  361. void ggml_backend_event_synchronize(ggml_backend_event_t event) {
  362. GGML_ASSERT(event);
  363. GGML_ASSERT(event->device->iface.event_synchronize);
  364. event->device->iface.event_synchronize(event->device, event);
  365. }
  366. void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
  367. GGML_ASSERT(backend);
  368. GGML_ASSERT(backend->iface.event_wait != NULL);
  369. backend->iface.event_wait(backend, event);
  370. }
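// Illustrative sketch (not part of the upstream file): using events to order work between two
// backends without a host-side synchronization. Event support is optional:
// ggml_backend_event_new() returns NULL when the device does not implement it.
#if 0
static void example_events(ggml_backend_t producer, ggml_backend_t consumer) {
    ggml_backend_event_t ev = ggml_backend_event_new(ggml_backend_get_device(producer));
    if (ev == NULL) {
        return; // events not supported by this device
    }
    // ... enqueue work on the producer ...
    ggml_backend_event_record(ev, producer); // mark the point in the producer's queue to wait for
    ggml_backend_event_wait(consumer, ev);   // the consumer's queue waits; the host thread does not block
    // ... enqueue dependent work on the consumer ...
    ggml_backend_event_synchronize(ev);      // keep the event alive until the recorded work completes
    ggml_backend_event_free(ev);
}
#endif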
  371. static void ggml_backend_graph_optimize(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
  372. GGML_ASSERT(backend);
  373. if (backend->iface.graph_optimize != NULL) {
  374. backend->iface.graph_optimize(backend, cgraph);
  375. }
  376. }
  377. // Backend device
  378. const char * ggml_backend_dev_name(ggml_backend_dev_t device) {
  379. GGML_ASSERT(device);
  380. return device->iface.get_name(device);
  381. }
  382. const char * ggml_backend_dev_description(ggml_backend_dev_t device) {
  383. GGML_ASSERT(device);
  384. return device->iface.get_description(device);
  385. }
  386. void ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * total) {
  387. GGML_ASSERT(device);
  388. device->iface.get_memory(device, free, total);
  389. }
  390. enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device) {
  391. GGML_ASSERT(device);
  392. return device->iface.get_type(device);
  393. }
  394. void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props) {
  395. memset(props, 0, sizeof(*props));
  396. device->iface.get_props(device, props);
  397. }
  398. ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device) {
  399. GGML_ASSERT(device);
  400. return device->reg;
  401. }
  402. ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params) {
  403. GGML_ASSERT(device);
  404. return device->iface.init_backend(device, params);
  405. }
  406. ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device) {
  407. GGML_ASSERT(device);
  408. return device->iface.get_buffer_type(device);
  409. }
  410. ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device) {
  411. GGML_ASSERT(device);
  412. if (device->iface.get_host_buffer_type == NULL) {
  413. return NULL;
  414. }
  415. return device->iface.get_host_buffer_type(device);
  416. }
  417. ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size) {
  418. GGML_ASSERT(device);
  419. return device->iface.buffer_from_host_ptr(device, ptr, size, max_tensor_size);
  420. }
  421. bool ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op) {
  422. GGML_ASSERT(device);
  423. return device->iface.supports_op(device, op);
  424. }
  425. bool ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft) {
  426. GGML_ASSERT(device);
  427. return device->iface.supports_buft(device, buft);
  428. }
  429. bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op) {
  430. GGML_ASSERT(device);
  431. if (device->iface.offload_op != NULL) {
  432. return device->iface.offload_op(device, op);
  433. }
  434. return false;
  435. }
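// Illustrative sketch (not part of the upstream file): querying a device through the accessors
// above before deciding whether to create a backend on it. Initialization params are
// backend-specific; passing NULL is assumed to select defaults here.
#if 0
static void example_device_query(ggml_backend_dev_t dev) {
    size_t free_mem = 0, total_mem = 0;
    ggml_backend_dev_memory(dev, &free_mem, &total_mem);
    GGML_LOG_DEBUG("device %s (%s): %zu of %zu bytes free\n",
        ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), free_mem, total_mem);
    if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
        ggml_backend_t backend = ggml_backend_dev_init(dev, /*params=*/NULL);
        ggml_backend_free(backend); // ggml_backend_free tolerates NULL if initialization failed
    }
}
#endif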
  436. // Backend (reg)
  437. const char * ggml_backend_reg_name(ggml_backend_reg_t reg) {
  438. GGML_ASSERT(reg);
  439. return reg->iface.get_name(reg);
  440. }
  441. size_t ggml_backend_reg_dev_count(ggml_backend_reg_t reg) {
  442. GGML_ASSERT(reg);
  443. return reg->iface.get_device_count(reg);
  444. }
  445. ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index) {
  446. GGML_ASSERT(reg);
  447. return reg->iface.get_device(reg, index);
  448. }
  449. void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
  450. GGML_ASSERT(reg);
  451. if (!reg->iface.get_proc_address) {
  452. return NULL;
  453. }
  454. return reg->iface.get_proc_address(reg, name);
  455. }
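// Illustrative sketch (not part of the upstream file): get_proc_address exposes optional,
// backend-specific entry points by name. The symbol name and signature below are hypothetical;
// real names and types are defined by each backend's registry.
#if 0
typedef void (*example_set_flag_t)(int value);
static void example_proc_address(ggml_backend_reg_t reg) {
    example_set_flag_t fn = (example_set_flag_t) ggml_backend_reg_get_proc_address(reg, "example_backend_set_flag");
    if (fn != NULL) {
        fn(1); // only call it if the backend actually provides the symbol
    }
}
#endif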
  456. // multi-buffer buffer
  457. struct ggml_backend_multi_buffer_context {
  458. ggml_backend_buffer_t * buffers;
  459. size_t n_buffers;
  460. };
  461. static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
  462. GGML_ASSERT(buffer);
  463. ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
  464. for (size_t i = 0; i < ctx->n_buffers; i++) {
  465. ggml_backend_buffer_free(ctx->buffers[i]);
  466. }
  467. free(ctx->buffers);
  468. free(ctx);
  469. }
  470. static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
  471. GGML_ASSERT(buffer);
  472. ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
  473. for (size_t i = 0; i < ctx->n_buffers; i++) {
  474. ggml_backend_buffer_clear(ctx->buffers[i], value);
  475. }
  476. }
  477. static const struct ggml_backend_buffer_i ggml_backend_multi_buffer_i = {
  478. /* .free_buffer = */ ggml_backend_multi_buffer_free_buffer,
  479. /* .get_base = */ NULL,
  480. /* .init_tensor = */ NULL,
  481. /* .memset_tensor = */ NULL,
  482. /* .set_tensor = */ NULL,
  483. /* .get_tensor = */ NULL,
  484. /* .cpy_tensor = */ NULL,
  485. /* .clear = */ ggml_backend_multi_buffer_clear,
  486. /* .reset = */ NULL,
  487. };
  488. ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers) {
  489. ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) malloc(sizeof(struct ggml_backend_multi_buffer_context));
  490. ctx->n_buffers = n_buffers;
  491. ctx->buffers = (ggml_backend_buffer_t *) malloc(n_buffers * sizeof(ggml_backend_buffer_t));
  492. GGML_ASSERT(ctx->buffers != NULL);
  493. size_t total_size = 0;
  494. for (size_t i = 0; i < n_buffers; i++) {
  495. ctx->buffers[i] = buffers[i];
  496. total_size += ggml_backend_buffer_get_size(buffers[i]);
  497. }
  498. return ggml_backend_buffer_init(buffers[0]->buft, ggml_backend_multi_buffer_i, ctx, total_size);
  499. }
  500. bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) {
  501. GGML_ASSERT(buffer);
  502. return buffer->iface.free_buffer == ggml_backend_multi_buffer_free_buffer;
  503. }
  504. void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
  505. GGML_ASSERT(buffer);
  506. GGML_ASSERT(ggml_backend_buffer_is_multi_buffer(buffer));
  507. ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
  508. for (size_t i = 0; i < ctx->n_buffers; i++) {
  509. ggml_backend_buffer_set_usage(ctx->buffers[i], usage);
  510. }
  511. }
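// Illustrative sketch (not part of the upstream file): combining several buffers into one
// composite buffer. Only free/clear/set_usage fan out to the children (the per-tensor entries
// in the interface are NULL), and the composite reports the buffer type of the first child.
#if 0
static ggml_backend_buffer_t example_multi_buffer(ggml_backend_buffer_type_t buft) {
    ggml_backend_buffer_t parts[2] = {
        ggml_backend_buft_alloc_buffer(buft, 1024),
        ggml_backend_buft_alloc_buffer(buft, 2048),
    };
    // the composite takes ownership: freeing it frees both parts
    return ggml_backend_multi_buffer_alloc_buffer(parts, 2);
}
#endif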
  512. // creates a copy of the tensor with the same memory layout
  513. static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, const struct ggml_tensor * tensor) {
  514. struct ggml_tensor * dup = ggml_dup_tensor(ctx, tensor);
  515. for (int i = 0; i < GGML_MAX_DIMS; i++) {
  516. dup->nb[i] = tensor->nb[i];
  517. }
  518. return dup;
  519. }
  520. static bool ggml_is_view_op(enum ggml_op op) {
  521. return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE;
  522. }
  523. // scheduler
  524. #ifndef GGML_SCHED_MAX_BACKENDS
  525. #define GGML_SCHED_MAX_BACKENDS 16
  526. #endif
  527. #ifndef GGML_SCHED_MAX_SPLIT_INPUTS
  528. #define GGML_SCHED_MAX_SPLIT_INPUTS 30
  529. #endif
  530. #ifndef GGML_SCHED_MAX_COPIES
  531. #define GGML_SCHED_MAX_COPIES 4
  532. #endif
  533. struct ggml_backend_sched_split {
  534. int backend_id;
  535. int i_start;
  536. int i_end;
  537. struct ggml_tensor * inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
  538. int n_inputs;
  539. // graph view of this split
  540. struct ggml_cgraph graph;
  541. };
  542. struct ggml_backend_sched {
  543. bool is_reset; // true if the scheduler has been reset since the last graph split
  544. bool is_alloc;
  545. int n_backends;
  546. ggml_backend_t backends[GGML_SCHED_MAX_BACKENDS];
  547. ggml_backend_buffer_type_t bufts[GGML_SCHED_MAX_BACKENDS];
  548. ggml_gallocr_t galloc;
  549. // hash map of the nodes in the graph
  550. struct ggml_hash_set hash_set;
  551. int * hv_tensor_backend_ids; // [hash_set.size]
  552. struct ggml_tensor ** hv_tensor_copies; // [hash_set.size][n_backends][n_copies]
  553. int * node_backend_ids; // [graph_size]
  554. int * leaf_backend_ids; // [graph_size]
  555. int * prev_node_backend_ids; // [graph_size]
  556. int * prev_leaf_backend_ids; // [graph_size]
  557. // copy of the graph with modified inputs
  558. struct ggml_cgraph graph;
  559. // graph splits
  560. struct ggml_backend_sched_split * splits;
  561. int n_splits;
  562. int splits_capacity;
  563. // pipeline parallelism support
  564. int n_copies;
  565. int cur_copy;
  566. int next_copy;
  567. ggml_backend_event_t events[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];
  568. struct ggml_tensor * graph_inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
  569. int n_graph_inputs;
  570. struct ggml_context * ctx;
  571. ggml_backend_sched_eval_callback callback_eval;
  572. void * callback_eval_user_data;
  573. char * context_buffer;
  574. size_t context_buffer_size;
  575. bool op_offload;
  576. int debug;
  577. };
  578. #define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
  579. #define tensor_backend_id(tensor) sched->hv_tensor_backend_ids[hash_id(tensor)]
  580. #define tensor_id_copy(id, backend_id, copy_id) sched->hv_tensor_copies[(id) * sched->n_backends * sched->n_copies + (backend_id) * sched->n_copies + (copy_id)]
  581. #define tensor_copy(tensor, backend_id, copy_id) tensor_id_copy(hash_id(tensor), backend_id, copy_id)
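// (worked example added for clarity, not in the upstream file) the copy table is a flat array
// indexed as [hash_id][backend_id][copy_id]: with n_backends = 3 and n_copies = 2, the copy of
// the tensor with hash id 5 on backend 1, copy 0 is stored at 5*3*2 + 1*2 + 0 = 32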
  582. // returns the id of the backend in the scheduler (its priority); a lower id means a higher priority
  583. static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backend_t backend) {
  584. for (int i = 0; i < sched->n_backends; i++) {
  585. if (sched->backends[i] == backend) {
  586. return i;
  587. }
  588. }
  589. return -1;
  590. }
  591. static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor, const struct ggml_tensor * op) {
  592. ggml_backend_buffer_t buffer = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
  593. if (buffer == NULL) {
  594. return -1;
  595. }
  596. // find highest prio backend that supports the buffer type and the op
  597. for (int i = 0; i < sched->n_backends; i++) {
  598. if (ggml_backend_supports_buft(sched->backends[i], buffer->buft) &&
  599. ggml_backend_supports_op(sched->backends[i], op)) {
  600. return i;
  601. }
  602. }
  603. #ifndef NDEBUG
  604. GGML_LOG_DEBUG("%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n",
  605. __func__, ggml_op_desc(tensor), ggml_backend_buffer_name(buffer), tensor->name);
  606. #endif
  607. return -1;
  608. }
  609. #if 0
  610. #define GGML_SCHED_MAX_SPLITS_DEBUG 4096
  611. static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS_DEBUG*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
  612. #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
  613. #define GET_CAUSE(node) causes[hash_id(node)]
  614. #else
  615. #define SET_CAUSE(node, ...)
  616. #define GET_CAUSE(node) ""
  617. #endif
  618. // returns the backend that should be used for the node based on the current locations
  619. static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * tensor) {
  620. // assign pre-allocated nodes to their backend
  621. int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor, tensor);
  622. if (cur_backend_id != -1) {
  623. SET_CAUSE(tensor, "1.dst");
  624. return cur_backend_id;
  625. }
  626. // view_src
  627. if (tensor->view_src != NULL) {
  628. cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src, tensor);
  629. if (cur_backend_id != -1) {
  630. SET_CAUSE(tensor, "1.vsrc");
  631. return cur_backend_id;
  632. }
  633. }
  634. if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) {
  635. // since the tensor is pre-allocated, it cannot be moved to another backend
  636. ggml_backend_buffer_t buffer = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
  637. GGML_ABORT("pre-allocated tensor (%s) in a buffer (%s) that cannot run the operation (%s)", tensor->name, ggml_backend_buffer_name(buffer), ggml_op_name(tensor->op));
  638. }
  639. // graph input
  640. if (tensor->flags & GGML_TENSOR_FLAG_INPUT) {
  641. cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU)
  642. SET_CAUSE(tensor, "1.inp");
  643. return cur_backend_id;
  644. }
  645. // operations with weights are preferably run on the same backend as the weights
  646. for (int i = 0; i < GGML_MAX_SRC; i++) {
  647. const struct ggml_tensor * src = tensor->src[i];
  648. if (src == NULL) {
  649. continue;
  650. }
  651. // skip ROPE since the rope freqs tensor is too small to choose a backend based on it
  652. // not an ideal solution
  653. if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
  654. int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
  655. // check if a backend with higher prio wants to offload the op
  656. if (sched->op_offload && src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) {
  657. for (int b = 0; b < src_backend_id; b++) {
  658. if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) {
  659. SET_CAUSE(tensor, "1.off");
  660. return b;
  661. }
  662. }
  663. }
  664. SET_CAUSE(tensor, "1.wgt%d", i);
  665. return src_backend_id;
  666. }
  667. }
  668. return -1;
  669. }
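// (comment added for clarity) assignment order implemented above:
//   1. a pre-allocated destination buffer that some backend supports wins ("1.dst")
//   2. otherwise the view source's buffer is used ("1.vsrc")
//   3. a pre-allocated tensor that no backend can run is a fatal error
//   4. graph inputs fall back to the last backend, assumed to be the CPU ("1.inp")
//   5. ops with a weight src run on the backend holding the weight ("1.wgt"), unless a
//      higher-priority backend asks to offload the op ("1.off")
//   -1 means the decision is deferred to the expansion passes below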
  670. static char * fmt_size(size_t size) {
  671. static char buffer[128];
  672. if (size >= 1024*1024) {
  673. snprintf(buffer, sizeof(buffer), "%zuM", size/1024/1024);
  674. } else {
  675. snprintf(buffer, sizeof(buffer), "%zuK", size/1024);
  676. }
  677. return buffer;
  678. }
  679. static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
  680. int cur_split = 0;
  681. for (int i = 0; i < graph->n_nodes; i++) {
  682. if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
  683. ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
  684. GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs", cur_split, ggml_backend_name(split_backend),
  685. sched->splits[cur_split].n_inputs);
  686. for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
  687. if (j == 0) {
  688. GGML_LOG_DEBUG(": ");
  689. }
  690. GGML_LOG_DEBUG("[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
  691. fmt_size(ggml_nbytes(sched->splits[cur_split].inputs[j])));
  692. }
  693. GGML_LOG_DEBUG("\n");
  694. cur_split++;
  695. }
  696. struct ggml_tensor * node = graph->nodes[i];
  697. if (ggml_is_view_op(node->op)) {
  698. continue;
  699. }
  700. if (sched->debug > 1) {
  701. ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
  702. GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s] use=%d:", i, ggml_op_name(node->op), node->name,
  703. fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node),
  704. graph->use_counts[ggml_hash_find(&graph->visited_hash_set, node)]);
  705. for (int j = 0; j < GGML_MAX_SRC; j++) {
  706. struct ggml_tensor * src = node->src[j];
  707. if (src == NULL) {
  708. continue;
  709. }
  710. ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
  711. GGML_LOG_DEBUG(" %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
  712. fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
  713. }
  714. GGML_LOG_DEBUG("\n");
  715. }
  716. }
  717. }
  718. static bool ggml_backend_sched_buffer_supported(ggml_backend_sched_t sched, struct ggml_tensor * t, int backend_id) {
  719. ggml_backend_buffer_t buf = t->view_src ? t->view_src->buffer : t->buffer;
  720. ggml_backend_buffer_type_t buft = NULL;
  721. if (buf) {
  722. // the tensor is already allocated
  723. buft = buf->buft;
  724. } else {
  725. // see if the tensor already has a backend assigned, and use the buffer type of that backend
  726. int tensor_backend_id = tensor_backend_id(t);
  727. if (tensor_backend_id == -1 && t->view_src) {
  728. tensor_backend_id = tensor_backend_id(t->view_src);
  729. }
  730. if (tensor_backend_id != -1) {
  731. buft = sched->bufts[tensor_backend_id];
  732. }
  733. }
  734. return buft != NULL && ggml_backend_supports_buft(sched->backends[backend_id], buft);
  735. }
  736. static void ggml_backend_sched_set_if_supported(ggml_backend_sched_t sched, struct ggml_tensor * node, int cur_backend_id, int * node_backend_id) {
  737. if (ggml_backend_supports_op(sched->backends[cur_backend_id], node)) {
  738. *node_backend_id = cur_backend_id;
  739. SET_CAUSE(node, "2.sup");
  740. }
  741. }
  742. // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
  743. void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
  744. // reset splits
  745. sched->n_splits = 0;
  746. sched->n_graph_inputs = 0;
  747. sched->is_reset = false;
  748. struct ggml_init_params params = {
  749. /* .mem_size = */ sched->context_buffer_size,
  750. /* .mem_buffer = */ sched->context_buffer,
  751. /* .no_alloc = */ true
  752. };
  753. ggml_free(sched->ctx);
  754. sched->ctx = ggml_init(params);
  755. if (sched->ctx == NULL) {
  756. GGML_ABORT("%s: failed to initialize context\n", __func__);
  757. }
  758. // pass 1: assign backends to ops with pre-allocated inputs
  759. for (int i = 0; i < graph->n_leafs; i++) {
  760. struct ggml_tensor * leaf = graph->leafs[i];
  761. int * leaf_backend_id = &tensor_backend_id(leaf);
  762. // do not overwrite user assignments
  763. if (*leaf_backend_id == -1) {
  764. *leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf);
  765. }
  766. }
  767. for (int i = 0; i < graph->n_nodes; i++) {
  768. struct ggml_tensor * node = graph->nodes[i];
  769. int * node_backend_id = &tensor_backend_id(node);
  770. // do not overwrite user assignments
  771. if (*node_backend_id == -1) {
  772. *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node);
  773. #if 0
  774. // src
  775. if (node->op == GGML_OP_NONE) {
  776. continue;
  777. }
  778. for (int j = 0; j < GGML_MAX_SRC; j++) {
  779. struct ggml_tensor * src = node->src[j];
  780. if (src == NULL) {
  781. continue;
  782. }
  783. int * src_backend_id = &tensor_backend_id(src);
  784. if (*src_backend_id == -1) {
  785. *src_backend_id = ggml_backend_sched_backend_id_from_cur(sched, src);
  786. }
  787. }
  788. #endif
  789. }
  790. }
  791. // pass 2: expand current backend assignments
  792. // assign the same backend to adjacent nodes
  793. // expand gpu backends (i.e. all backends except the lowest-priority one) up and down, ignoring cpu (the lowest-priority backend)
  794. // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
  795. // ops unsupported by the backend being expanded are left unassigned, so that they can be assigned later once the locations of their inputs are known
  796. // expand gpu down
  797. {
  798. int cur_backend_id = -1;
  799. for (int i = 0; i < graph->n_nodes; i++) {
  800. struct ggml_tensor * node = graph->nodes[i];
  801. if (ggml_is_view_op(node->op)) {
  802. continue;
  803. }
  804. int * node_backend_id = &tensor_backend_id(node);
  805. if (*node_backend_id != -1) {
  806. if (*node_backend_id == sched->n_backends - 1) {
  807. // skip cpu (lowest prio backend)
  808. cur_backend_id = -1;
  809. } else {
  810. cur_backend_id = *node_backend_id;
  811. }
  812. } else if (cur_backend_id != -1) {
  813. ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
  814. }
  815. }
  816. }
  817. // expand gpu up
  818. {
  819. int cur_backend_id = -1;
  820. for (int i = graph->n_nodes - 1; i >= 0; i--) {
  821. struct ggml_tensor * node = graph->nodes[i];
  822. if (ggml_is_view_op(node->op)) {
  823. continue;
  824. }
  825. int * node_backend_id = &tensor_backend_id(node);
  826. if (*node_backend_id != -1) {
  827. if (*node_backend_id == sched->n_backends - 1) {
  828. // skip cpu (lowest prio backend)
  829. cur_backend_id = -1;
  830. } else {
  831. cur_backend_id = *node_backend_id;
  832. }
  833. } else if (cur_backend_id != -1) {
  834. ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
  835. }
  836. }
  837. }
  838. // expand rest down
  839. {
  840. int cur_backend_id = -1;
  841. for (int i = 0; i < graph->n_nodes; i++) {
  842. struct ggml_tensor * node = graph->nodes[i];
  843. if (ggml_is_view_op(node->op)) {
  844. continue;
  845. }
  846. int * node_backend_id = &tensor_backend_id(node);
  847. if (*node_backend_id != -1) {
  848. cur_backend_id = *node_backend_id;
  849. } else if (cur_backend_id != -1) {
  850. ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
  851. }
  852. }
  853. }
  854. // expand rest up
  855. {
  856. int cur_backend_id = -1;
  857. for (int i = graph->n_nodes - 1; i >= 0; i--) {
  858. struct ggml_tensor * node = graph->nodes[i];
  859. if (ggml_is_view_op(node->op)) {
  860. continue;
  861. }
  862. int * node_backend_id = &tensor_backend_id(node);
  863. if (*node_backend_id != -1) {
  864. cur_backend_id = *node_backend_id;
  865. } else if (cur_backend_id != -1) {
  866. ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
  867. }
  868. }
  869. }
  870. // pass 3: upgrade nodes to higher prio backends with compatible buffer types
  871. // if the tensor already uses the same buffer type (*) as a higher-priority backend, move it to that backend
  872. // however, we also need to verify that its sources are in compatible buffer types
  873. // (*) the actual requirement is more relaxed: the buffer type of the backend only needs to be supported by all users of this tensor further down the graph
  874. // however, this is slow to verify, so we use the stricter requirement that the buffer type must be the same
  875. // this case is not uncommon, since multiple backends can use host memory with the same buffer type (e.g. BLAS and CPU)
  876. // additionally, set remaining unassigned nodes to the backend with the most supported inputs
  877. // only nodes that could not be assigned during expansion due to the backend not supporting the op should be unassigned at this point
  878. for (int i = 0; i < graph->n_nodes; i++) {
  879. struct ggml_tensor * node = graph->nodes[i];
  880. if (ggml_is_view_op(node->op)) {
  881. continue;
  882. }
  883. int * node_backend_id = &tensor_backend_id(node);
  884. if (*node_backend_id == -1) {
  885. // unassigned node: find the backend with the most supported inputs
  886. int n_supported_best = -1;
  887. for (int b = 0; b < sched->n_backends; b++) {
  888. if (ggml_backend_supports_op(sched->backends[b], node)) {
  889. int n_supported = 0;
  890. for (int j = 0; j < GGML_MAX_SRC; j++) {
  891. struct ggml_tensor * src = node->src[j];
  892. if (src == NULL) {
  893. continue;
  894. }
  895. if ((tensor_backend_id(src) != -1 || tensor_backend_id(src->view_src) != -1) && ggml_backend_sched_buffer_supported(sched, src, b)) {
  896. n_supported++;
  897. }
  898. }
  899. if (n_supported > n_supported_best) {
  900. n_supported_best = n_supported;
  901. *node_backend_id = b;
  902. SET_CAUSE(node, "3.best");
  903. }
  904. }
  905. }
  906. } else {
  907. // assigned node: upgrade to higher prio backend if possible
  908. for (int b = 0; b < *node_backend_id; b++) {
  909. if (sched->bufts[b] == sched->bufts[*node_backend_id] && ggml_backend_supports_op(sched->backends[b], node)) {
  910. bool supported = true;
  911. for (int j = 0; j < GGML_MAX_SRC; j++) {
  912. struct ggml_tensor * src = node->src[j];
  913. if (src == NULL) {
  914. continue;
  915. }
  916. if (!ggml_backend_sched_buffer_supported(sched, src, b)) {
  917. supported = false;
  918. break;
  919. }
  920. }
  921. if (supported) {
  922. *node_backend_id = b;
  923. SET_CAUSE(node, "3.upg");
  924. break;
  925. }
  926. }
  927. }
  928. }
  929. }
  930. // pass 4: assign backends to remaining src from dst and view_src
  931. for (int i = 0; i < graph->n_nodes; i++) {
  932. struct ggml_tensor * node = graph->nodes[i];
  933. int * cur_backend_id = &tensor_backend_id(node);
  934. if (node->view_src != NULL && *cur_backend_id == -1) {
  935. *cur_backend_id = tensor_backend_id(node->view_src);
  936. SET_CAUSE(node, "4.vsrc");
  937. }
  938. for (int j = 0; j < GGML_MAX_SRC; j++) {
  939. struct ggml_tensor * src = node->src[j];
  940. if (src == NULL) {
  941. continue;
  942. }
  943. int * src_backend_id = &tensor_backend_id(src);
  944. if (*src_backend_id == -1) {
  945. if (src->view_src != NULL) {
  946. // views are always on the same backend as the source
  947. *src_backend_id = tensor_backend_id(src->view_src);
  948. SET_CAUSE(src, "4.vsrc");
  949. } else {
  950. *src_backend_id = *cur_backend_id;
  951. SET_CAUSE(src, "4.cur");
  952. }
  953. }
  954. }
  955. // if the node is still unassigned, assign it to the first backend that supports it
  956. for (int b = 0; b < sched->n_backends && *cur_backend_id == -1; b++) {
  957. ggml_backend_sched_set_if_supported(sched, node, b, cur_backend_id);
  958. }
  959. GGML_ASSERT(*cur_backend_id != -1);
  960. }
  961. // pass 5: split graph, find tensors that need to be copied
  962. {
  963. int i_split = 0;
  964. struct ggml_backend_sched_split * split = &sched->splits[0];
  965. // find the backend of the first split, skipping view ops
  966. int i = 0;
  967. for (; i < graph->n_nodes; i++) {
  968. struct ggml_tensor * node = graph->nodes[i];
  969. if (!ggml_is_view_op(node->op)) {
  970. split->backend_id = tensor_backend_id(node);
  971. break;
  972. }
  973. }
  974. split->i_start = 0;
  975. split->n_inputs = 0;
  976. int cur_backend_id = split->backend_id;
  977. for (; i < graph->n_nodes; i++) {
  978. struct ggml_tensor * node = graph->nodes[i];
  979. if (ggml_is_view_op(node->op)) {
  980. continue;
  981. }
  982. const int node_backend_id = tensor_backend_id(node);
  983. GGML_ASSERT(node_backend_id != -1); // all nodes should be assigned by now, this can happen if there is no CPU fallback
  984. // check if we should start a new split based on the sources of the current node
  985. bool need_new_split = false;
  986. if (node_backend_id == cur_backend_id && split->n_inputs > 0) {
  987. for (int j = 0; j < GGML_MAX_SRC; j++) {
  988. struct ggml_tensor * src = node->src[j];
  989. if (src == NULL) {
  990. continue;
  991. }
  992. // check if a weight is on a different and incompatible backend
  993. // by starting a new split, the memory of the previously offloaded weights can be reused
  994. if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
  995. int src_backend_id = tensor_backend_id(src);
  996. if (src_backend_id != cur_backend_id && !ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
  997. need_new_split = true;
  998. break;
  999. }
  1000. }
  1001. // check if the split has too many inputs
  1002. // FIXME: count the number of inputs instead of only checking when full
  1003. if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) {
  1004. const size_t id = hash_id(src);
  1005. int src_backend_id = sched->hv_tensor_backend_ids[id];
  1006. bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
  1007. if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == NULL && !supported) {
  1008. need_new_split = true;
  1009. break;
  1010. }
  1011. }
  1012. }
  1013. }
  1014. if (node_backend_id != cur_backend_id || need_new_split) {
  1015. split->i_end = i;
  1016. i_split++;
  1017. if (i_split >= sched->splits_capacity) {
  1018. sched->splits_capacity *= 2;
  1019. sched->splits = (ggml_backend_sched_split *)
  1020. realloc(sched->splits, sched->splits_capacity * sizeof(struct ggml_backend_sched_split));
  1021. GGML_ASSERT(sched->splits != NULL);
  1022. }
  1023. split = &sched->splits[i_split];
  1024. split->backend_id = node_backend_id;
  1025. split->i_start = i;
  1026. split->n_inputs = 0;
  1027. cur_backend_id = node_backend_id;
  1028. }
  1029. // find inputs that are not on the same backend
  1030. for (int j = 0; j < GGML_MAX_SRC; j++) {
  1031. struct ggml_tensor * src = node->src[j];
  1032. if (src == NULL) {
  1033. continue;
  1034. }
  1035. size_t src_id = hash_id(src);
  1036. const int src_backend_id = sched->hv_tensor_backend_ids[src_id];
  1037. GGML_ASSERT(src_backend_id != -1); // all inputs should be assigned by now
  1038. if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
  1039. if (tensor_id_copy(src_id, src_backend_id, 0) == NULL) {
  1040. ggml_backend_t backend = sched->backends[src_backend_id];
  1041. for (int c = 0; c < sched->n_copies; c++) {
  1042. struct ggml_tensor * tensor_copy;
  1043. if (c == sched->cur_copy) {
  1044. tensor_copy = src; // use the original tensor as the current copy
  1045. } else {
  1046. tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
  1047. ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c);
  1048. }
  1049. if (sched->n_copies > 1) {
  1050. ggml_set_input(tensor_copy);
  1051. ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
  1052. }
  1053. tensor_id_copy(src_id, src_backend_id, c) = tensor_copy;
  1054. SET_CAUSE(tensor_copy, "4.cpy");
  1055. }
  1056. int n_graph_inputs = sched->n_graph_inputs++;
  1057. GGML_ASSERT(n_graph_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
  1058. sched->graph_inputs[n_graph_inputs] = src;
  1059. }
  1060. }
  1061. if (src_backend_id != cur_backend_id && !ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
  1062. // create a copy of the input in the split's backend
  1063. if (tensor_id_copy(src_id, cur_backend_id, 0) == NULL) {
  1064. ggml_backend_t backend = sched->backends[cur_backend_id];
  1065. for (int c = 0; c < sched->n_copies; c++) {
  1066. struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
  1067. ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c);
  1068. if (sched->n_copies > 1) {
  1069. ggml_set_input(tensor_copy);
  1070. ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
  1071. }
  1072. tensor_id_copy(src_id, cur_backend_id, c) = tensor_copy;
  1073. SET_CAUSE(tensor_copy, "4.cpy");
  1074. }
  1075. int n_inputs = split->n_inputs++;
  1076. GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
  1077. split->inputs[n_inputs] = src;
  1078. }
  1079. node->src[j] = tensor_id_copy(src_id, cur_backend_id, sched->cur_copy);
  1080. }
  1081. }
  1082. }
  1083. split->i_end = graph->n_nodes;
  1084. sched->n_splits = i_split + 1;
  1085. }
  1086. if (sched->debug) {
  1087. ggml_backend_sched_print_assignments(sched, graph);
  1088. }
  1089. // swap node_backend_ids and leaf_backend_ids with prevs
  1090. {
  1091. int * tmp = sched->node_backend_ids;
  1092. sched->node_backend_ids = sched->prev_node_backend_ids;
  1093. sched->prev_node_backend_ids = tmp;
  1094. tmp = sched->leaf_backend_ids;
  1095. sched->leaf_backend_ids = sched->prev_leaf_backend_ids;
  1096. sched->prev_leaf_backend_ids = tmp;
  1097. }
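// (comment added for clarity) the previous assignments are kept so that
// ggml_backend_sched_alloc_splits() below can detect whether any node or leaf moved to a
// different buffer type since the last split, in which case the graph must be re-allocated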

    int graph_size = std::max(graph->n_nodes, graph->n_leafs) + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sched->n_copies;
    if (sched->graph.size < graph_size) {
        sched->graph.size = graph_size;
        sched->graph.nodes = (ggml_tensor **) realloc(sched->graph.nodes, graph_size * sizeof(struct ggml_tensor *));
        sched->graph.leafs = (ggml_tensor **) realloc(sched->graph.leafs, graph_size * sizeof(struct ggml_tensor *));
        GGML_ASSERT(sched->graph.nodes != NULL);
        GGML_ASSERT(sched->graph.leafs != NULL);
    }
    sched->graph.n_nodes = 0;
    sched->graph.n_leafs = 0;

    struct ggml_cgraph * graph_copy = &sched->graph;

    for (int i = 0; i < sched->n_splits; i++) {
        struct ggml_backend_sched_split * split = &sched->splits[i];
        split->graph = ggml_graph_view(graph, split->i_start, split->i_end);

        // Optimize this split of the graph. This needs to happen before we make graph_copy,
        // so they are in sync.
        ggml_backend_graph_optimize(sched->backends[split->backend_id], &split->graph);

        // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
        for (int j = 0; j < split->n_inputs; j++) {
            assert(graph_copy->size > (graph_copy->n_nodes + 1));

            struct ggml_tensor * input = split->inputs[j];
            const size_t input_id = hash_id(input);
            struct ggml_tensor * input_cpy = tensor_id_copy(input_id, split->backend_id, sched->cur_copy);

            // add a dependency to the input source so that it is not freed before the copy is done
            struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input);
            input_dep->src[0] = input;
            sched->node_backend_ids[graph_copy->n_nodes] = sched->hv_tensor_backend_ids[input_id];
            graph_copy->nodes[graph_copy->n_nodes++] = input_dep;

            // add a dependency to the input copy so that it is allocated at the start of the split
            sched->node_backend_ids[graph_copy->n_nodes] = split->backend_id;
            graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
        }

        for (int j = split->i_start; j < split->i_end; j++) {
            assert(graph_copy->size > graph_copy->n_nodes);
            sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]);
            graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
        }
    }

    if (sched->n_copies > 1) {
        // add input copies as leafs so that they are allocated first
        for (int i = 0; i < sched->n_graph_inputs; i++) {
            struct ggml_tensor * input = sched->graph_inputs[i];
            size_t id = hash_id(input);
            int backend_id = tensor_backend_id(input);
            for (int c = 0; c < sched->n_copies; c++) {
                struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
                sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
                assert(graph_copy->size > graph_copy->n_leafs);
                graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
            }
        }

        for (int i = 0; i < sched->n_splits; i++) {
            struct ggml_backend_sched_split * split = &sched->splits[i];
            int backend_id = split->backend_id;
            for (int j = 0; j < split->n_inputs; j++) {
                struct ggml_tensor * input = split->inputs[j];
                size_t id = hash_id(input);
                for (int c = 0; c < sched->n_copies; c++) {
                    struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
                    sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
                    assert(graph_copy->size > graph_copy->n_leafs);
                    graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
                }
            }
        }
    }

    // add leafs from the original graph
    for (int i = 0; i < graph->n_leafs; i++) {
        struct ggml_tensor * leaf = graph->leafs[i];
        sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
        assert(graph_copy->size > graph_copy->n_leafs);
        graph_copy->leafs[graph_copy->n_leafs++] = leaf;
    }
}
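
// check whether any backend assignment changed since the previous graph and allocate the graph
// with ggml-alloc, falling back to a full re-reserve when the allocation fails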
static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
    bool backend_ids_changed = false;
    for (int i = 0; i < sched->graph.n_nodes; i++) {
        if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i] &&
            sched->bufts[sched->node_backend_ids[i]] != sched->bufts[sched->prev_node_backend_ids[i]]) {
            backend_ids_changed = true;
            break;
        }
    }
    if (!backend_ids_changed) {
        for (int i = 0; i < sched->graph.n_leafs; i++) {
            if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i] &&
                sched->bufts[sched->leaf_backend_ids[i]] != sched->bufts[sched->prev_leaf_backend_ids[i]]) {
                backend_ids_changed = true;
                break;
            }
        }
    }

    // allocate graph
    if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
        // the re-allocation may cause the split inputs to be moved to a different address
        // synchronize without ggml_backend_sched_synchronize to avoid changing cur_copy
        for (int i = 0; i < sched->n_backends; i++) {
            ggml_backend_synchronize(sched->backends[i]);
        }
#ifndef NDEBUG
        GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
#endif
        ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
        if (!ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
            GGML_LOG_ERROR("%s: failed to allocate graph\n", __func__);
            return false;
        }
    }

    return true;
}
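
// evaluate the splits in order: copy the inputs of each split to the split backend (copying only
// the experts that are actually used for MoE weights stored in a host buffer), then compute the
// split graph asynchronously and record an event so the input copies can be reused safely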
static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
    GGML_ASSERT(sched);
    struct ggml_backend_sched_split * splits = sched->splits;

    ggml_tensor * prev_ids_tensor = nullptr;
    std::vector<int32_t> ids;
    std::vector<ggml_bitset_t> used_ids;

    for (int split_id = 0; split_id < sched->n_splits; split_id++) {
        struct ggml_backend_sched_split * split = &splits[split_id];
        int split_backend_id = split->backend_id;
        ggml_backend_t split_backend = sched->backends[split_backend_id];

        // copy the input tensors to the split backend
        for (int input_id = 0; input_id < split->n_inputs; input_id++) {
            ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[input_id]);
            struct ggml_tensor * input = split->inputs[input_id];
            struct ggml_tensor * input_cpy = tensor_copy(input, split_backend_id, sched->cur_copy);

            if (input->flags & GGML_TENSOR_FLAG_INPUT) {
                // inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
                if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
                    ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
                } else {
                    ggml_backend_synchronize(split_backend);
                }
                ggml_backend_tensor_copy(input, input_cpy);
            } else {
                // wait for the split backend to finish using the input before overwriting it
                if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
                    ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
                } else {
                    ggml_backend_synchronize(split_backend);
                }

                // when offloading MoE weights, we can reduce the amount of data copied by copying only the experts that are used
                ggml_tensor * node = split->graph.nodes[0];
                if (split->graph.n_nodes > 0 &&
                    ggml_backend_buffer_get_usage(input->buffer) == GGML_BACKEND_BUFFER_USAGE_WEIGHTS &&
                    ggml_backend_buffer_is_host(input->buffer) && (
                        (node->src[0] == input_cpy && node->op == GGML_OP_MUL_MAT_ID)
                        //|| (node->src[1] == input_cpy && node->op == GGML_OP_ADD_ID) /* GGML_OP_ADD_ID weights are small and not worth splitting */
                    )) {
                    const int64_t n_expert = node->op == GGML_OP_MUL_MAT_ID ? input->ne[2] : input->ne[1];
                    const size_t expert_size = node->op == GGML_OP_MUL_MAT_ID ? input->nb[2] : input->nb[1];

                    ggml_backend_synchronize(input_backend);

                    // get the ids
                    ggml_tensor * ids_tensor = node->src[2];
                    ggml_backend_t ids_backend = split_backend;

                    // if the ids tensor is also an input of the split, it may not have been copied yet to the split backend
                    // in that case, we use the original ids tensor
                    for (int i = input_id + 1; i < split->n_inputs; i++) {
                        if (ids_tensor == tensor_copy(split->inputs[i], split_backend_id, sched->cur_copy)) {
                            ids_tensor = split->inputs[i];
                            ids_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[i]);
                            break;
                        }
                    }

                    if (ids_tensor != prev_ids_tensor) {
                        ids.resize(ggml_nbytes(ids_tensor) / sizeof(int32_t));
                        ggml_backend_tensor_get_async(ids_backend, ids_tensor, ids.data(), 0, ggml_nbytes(ids_tensor));
                        ggml_backend_synchronize(ids_backend);

                        // find the used experts
                        used_ids.clear();
                        used_ids.resize(ggml_bitset_size(n_expert));
                        for (int64_t i1 = 0; i1 < ids_tensor->ne[1]; i1++) {
                            for (int64_t i0 = 0; i0 < ids_tensor->ne[0]; i0++) {
                                int32_t id = ids[i1 * ids_tensor->nb[1]/sizeof(int32_t) + i0 * ids_tensor->nb[0]/sizeof(int32_t)];
                                GGML_ASSERT(id >= 0 && id < n_expert);
                                ggml_bitset_set(used_ids.data(), id);
                            }
                        }

                        prev_ids_tensor = ids_tensor;
                    }

                    // group consecutive experts and copy them together
                    auto copy_experts = [&](int32_t first_id, int32_t last_id) {
                        const size_t expert_offset = first_id * expert_size;
                        const size_t expert_size_copy = (last_id - first_id + 1) * expert_size;
                        const size_t padding = std::min<size_t>(expert_size, 512);
                        const size_t padding_end = last_id < n_expert - 1 ? padding : 0;

                        ggml_backend_tensor_set_async(split_backend,
                            input_cpy,
                            (const uint8_t *)input->data + expert_offset, expert_offset,
                            // copy a bit extra at the end to ensure there are no NaNs in the padding of the last expert
                            // this is necessary for MMQ in the CUDA backend
                            expert_size_copy + padding_end);
                    };

                    int id = 0;
                    while (!ggml_bitset_get(used_ids.data(), id)) {
                        id++;
                    }
                    int32_t first_id = id;
                    int32_t last_id = first_id;

                    for (++id; id < n_expert; ++id) {
                        if (!ggml_bitset_get(used_ids.data(), id)) {
                            continue;
                        }

                        if (id == last_id + 1) {
                            last_id = id;
                            continue;
                        }

                        copy_experts(first_id, last_id);

                        first_id = id;
                        last_id = id;
                    }
                    copy_experts(first_id, last_id);
                } else {
                    // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
                    // TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
                    if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
                        ggml_backend_synchronize(input_backend);
                        if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
                            ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
                        } else {
                            ggml_backend_synchronize(split_backend);
                        }
                        ggml_backend_tensor_copy(input, input_cpy);
                    }
                }
            }
        }

        if (!sched->callback_eval) {
            enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
            if (ec != GGML_STATUS_SUCCESS) {
                return ec;
            }
        } else {
            // similar to ggml_backend_compare_graph_backend
            for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
                struct ggml_tensor * t = split->graph.nodes[j0];

                // check if the user needs data from this node
                bool need = sched->callback_eval(t, true, sched->callback_eval_user_data);

                int j1 = j0;

                // determine the range [j0, j1] of nodes that can be computed together
                while (!need && j1 < split->graph.n_nodes - 1) {
                    t = split->graph.nodes[++j1];
                    need = sched->callback_eval(t, true, sched->callback_eval_user_data);
                }

                struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);

                enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv);
                if (ec != GGML_STATUS_SUCCESS) {
                    return ec;
                }

                // TODO: pass backend to the callback, then the user can decide if they want to synchronize
                ggml_backend_synchronize(split_backend);

                if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
                    break;
                }

                j0 = j1;
            }
        }

        // record the event of this copy
        if (split->n_inputs > 0) {
            if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
                ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy], split_backend);
            }
        }
    }

    return GGML_STATUS_SUCCESS;
}
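
// Illustrative usage sketch for the scheduler API defined below. This is a simplified example,
// not code from this file: `gpu_backend`, `cpu_backend` and `build_graph()` are placeholder
// names for objects the application is assumed to provide.
//
//     ggml_backend_t backends[2] = { gpu_backend, cpu_backend }; // CPU backend must be last
//     ggml_backend_sched_t sched = ggml_backend_sched_new(backends, NULL, 2, GGML_DEFAULT_GRAPH_SIZE, false, true);
//
//     ggml_backend_sched_reserve(sched, build_graph(/* worst case */)); // pre-allocate compute buffers once
//
//     struct ggml_cgraph * graph = build_graph(/* current batch */);
//     ggml_backend_sched_graph_compute(sched, graph);                   // split, allocate and run the graph
//     ggml_backend_sched_reset(sched);                                  // clear assignments for the next graph
//
//     ggml_backend_sched_free(sched);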
ggml_backend_sched_t ggml_backend_sched_new(
        ggml_backend_t * backends,
        ggml_backend_buffer_type_t * bufts,
        int n_backends,
        size_t graph_size,
        bool parallel,
        bool op_offload) {
    GGML_ASSERT(n_backends > 0);
    GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
    GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU);

    struct ggml_backend_sched * sched = (ggml_backend_sched *) calloc(1, sizeof(struct ggml_backend_sched));

    const char * GGML_SCHED_DEBUG = getenv("GGML_SCHED_DEBUG");
    sched->debug = GGML_SCHED_DEBUG ? atoi(GGML_SCHED_DEBUG) : 0;
    sched->n_backends = n_backends;
    sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;

    // initialize hash table
    // FIXME: needs to be size*2 to account for leafs (do it in graph_split instead)
    sched->hash_set = ggml_hash_set_new(graph_size);
    sched->hv_tensor_backend_ids = (int *) malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
    sched->hv_tensor_copies = (ggml_tensor **) malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *));

    const size_t ggml_sched_max_splits = graph_size; // at most there is one split for each node in the graph
    const size_t nodes_size = graph_size + ggml_sched_max_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2;
    sched->node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
    sched->leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
    sched->prev_node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
    sched->prev_leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));

    sched->context_buffer_size = ggml_sched_max_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + ggml_graph_overhead_custom(graph_size, false);
    sched->context_buffer = (char *) malloc(sched->context_buffer_size);

    const int initial_splits_capacity = 16;
    sched->splits = (ggml_backend_sched_split *) calloc(initial_splits_capacity, sizeof(sched->splits[0]));
    sched->splits_capacity = initial_splits_capacity;

    for (int b = 0; b < n_backends; b++) {
        sched->backends[b] = backends[b];
        sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]);
        GGML_ASSERT(ggml_backend_supports_buft(backends[b], sched->bufts[b]));

        if (sched->n_copies > 1) {
            for (int c = 0; c < sched->n_copies; c++) {
                sched->events[b][c] = ggml_backend_event_new(backends[b]->device);
            }
        }
    }

    sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);

    sched->op_offload = op_offload;

    ggml_backend_sched_reset(sched);

    return sched;
}

void ggml_backend_sched_free(ggml_backend_sched_t sched) {
    if (sched == NULL) {
        return;
    }
    for (int b = 0; b < sched->n_backends; b++) {
        for (int c = 0; c < sched->n_copies; c++) {
            ggml_backend_event_free(sched->events[b][c]);
        }
    }
    ggml_gallocr_free(sched->galloc);
    ggml_free(sched->ctx);
    ggml_hash_set_free(&sched->hash_set);
    free(sched->splits);
    free(sched->hv_tensor_backend_ids);
    free(sched->hv_tensor_copies);
    free(sched->node_backend_ids);
    free(sched->leaf_backend_ids);
    free(sched->prev_node_backend_ids);
    free(sched->prev_leaf_backend_ids);
    free(sched->context_buffer);
    free(sched->graph.nodes);
    free(sched->graph.leafs);
    free(sched);
}

void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
    GGML_ASSERT(sched);
    // reset state for the next run
    if (!sched->is_reset) {
        ggml_hash_set_reset(&sched->hash_set);
        memset(sched->hv_tensor_backend_ids, -1, sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
        memset(sched->hv_tensor_copies, 0, sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *));
        sched->is_reset = true;
    }
    sched->is_alloc = false;
}

bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
    GGML_ASSERT(sched);
    GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);

    ggml_backend_sched_reset(sched);
    ggml_backend_sched_synchronize(sched);

    ggml_backend_sched_split_graph(sched, measure_graph);

    if (!ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
        return false;
    }

    ggml_backend_sched_reset(sched);

    return true;
}

bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
    GGML_ASSERT(sched);
    GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + graph->n_leafs);
    GGML_ASSERT(!sched->is_alloc);

    sched->cur_copy = sched->next_copy;
    sched->next_copy = (sched->next_copy + 1) % sched->n_copies;

    ggml_backend_sched_split_graph(sched, graph);

    if (!ggml_backend_sched_alloc_splits(sched)) {
        return false;
    }

    sched->is_alloc = true;

    return true;
}

enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
    enum ggml_status err = ggml_backend_sched_graph_compute_async(sched, graph);
    ggml_backend_sched_synchronize(sched);
    return err;
}

enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
    GGML_ASSERT(sched);
    if (!sched->is_reset && !sched->is_alloc) {
        ggml_backend_sched_reset(sched);
    }

    if (!sched->is_alloc) {
        if (!ggml_backend_sched_alloc_graph(sched, graph)) {
            return GGML_STATUS_ALLOC_FAILED;
        }
    }

    return ggml_backend_sched_compute_splits(sched);
}

void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
    GGML_ASSERT(sched);
    for (int i = 0; i < sched->n_backends; i++) {
        ggml_backend_synchronize(sched->backends[i]);
    }
    if (!sched->is_alloc) {
        // if the graph is not already allocated, always use copy 0 after a synchronization
        // this ensures that during generation the same copy is used every time,
        // which avoids changes in the graph that could cause CUDA or other graphs to be disabled
        sched->next_copy = 0;
    }
}

void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
    GGML_ASSERT(sched);
    sched->callback_eval = callback;
    sched->callback_eval_user_data = user_data;
}

int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
    GGML_ASSERT(sched);
    return sched->n_splits;
}

int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched) {
    GGML_ASSERT(sched);
    return sched->n_copies;
}

int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched) {
    GGML_ASSERT(sched);
    return sched->n_backends;
}

ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i) {
    GGML_ASSERT(sched);
    GGML_ASSERT(i >= 0 && i < sched->n_backends);
    return sched->backends[i];
}

ggml_backend_buffer_type_t ggml_backend_sched_get_buffer_type(ggml_backend_sched_t sched, ggml_backend_t backend) {
    GGML_ASSERT(sched);
    int backend_index = ggml_backend_sched_backend_id(sched, backend);
    GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
    return sched->bufts[backend_index];
}

size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
    GGML_ASSERT(sched);
    int backend_index = ggml_backend_sched_backend_id(sched, backend);
    GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
    return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
}

void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
    GGML_ASSERT(sched);
    int backend_index = ggml_backend_sched_backend_id(sched, backend);
    GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
    tensor_backend_id(node) = backend_index;
    SET_CAUSE(node, "usr");
    sched->is_reset = false;
}

ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
    GGML_ASSERT(sched);
    int backend_index = tensor_backend_id(node);
    if (backend_index == -1) {
        return NULL;
    }
    return sched->backends[backend_index];
}

// utils

enum ggml_status ggml_backend_view_init(struct ggml_tensor * tensor) {
    GGML_ASSERT(tensor);
    GGML_ASSERT(tensor->buffer == NULL);
    GGML_ASSERT(tensor->view_src != NULL);
    GGML_ASSERT(tensor->view_src->buffer != NULL);
    GGML_ASSERT(tensor->view_src->data != NULL);

    tensor->buffer = tensor->view_src->buffer;
    tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
    return ggml_backend_buffer_init_tensor(tensor->buffer, tensor);
}

enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) {
    GGML_ASSERT(tensor);
    GGML_ASSERT(tensor->buffer == NULL);
    GGML_ASSERT(tensor->data == NULL);
    GGML_ASSERT(tensor->view_src == NULL);
    GGML_ASSERT(addr >= ggml_backend_buffer_get_base(buffer));
    GGML_ASSERT((char *)addr + ggml_backend_buffer_get_alloc_size(buffer, tensor) <=
                (char *)ggml_backend_buffer_get_base(buffer) + ggml_backend_buffer_get_size(buffer));

    tensor->buffer = buffer;
    tensor->data = addr;
    return ggml_backend_buffer_init_tensor(buffer, tensor);
}
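
// graph copy helpers: tensors that own their data are duplicated in ctx_allocated and later
// copied into a dedicated buffer, while views are duplicated in ctx_unallocated and re-created
// on top of the copied data with ggml_backend_view_init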
static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies,
        struct ggml_context * ctx_allocated, struct ggml_context * ctx_unallocated, struct ggml_tensor * src) {
    GGML_ASSERT(src != NULL);
    GGML_ASSERT(src->data && "graph must be allocated");

    size_t id = ggml_hash_insert(&hash_set, src);
    if (id == GGML_HASHSET_ALREADY_EXISTS) {
        return node_copies[ggml_hash_find(&hash_set, src)];
    }

    struct ggml_tensor * dst = ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src);
    if (src->view_src != NULL) {
        dst->view_src = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src);
        dst->view_offs = src->view_offs;
    }
    dst->op = src->op;
    memcpy(dst->op_params, src->op_params, sizeof(dst->op_params));
    ggml_set_name(dst, src->name);

    // copy src
    for (int i = 0; i < GGML_MAX_SRC; i++) {
        struct ggml_tensor * s = src->src[i];
        if (s == NULL) {
            continue;
        }
        dst->src[i] = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s);
    }

    node_copies[id] = dst;
    return dst;
}

static void graph_copy_init_tensor(struct ggml_hash_set * hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) {
    size_t id = ggml_hash_find(hash_set, src);
    if (node_init[id]) {
        return;
    }
    node_init[id] = true;

    struct ggml_tensor * dst = node_copies[id];
    if (dst->view_src != NULL) {
        graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
        enum ggml_status status = ggml_backend_view_init(dst);
        GGML_ASSERT(status == GGML_STATUS_SUCCESS);
    }
    else {
        ggml_backend_tensor_copy(src, dst);
    }

    // init src
    for (int i = 0; i < GGML_MAX_SRC; i++) {
        struct ggml_tensor * s = src->src[i];
        if (s == NULL) {
            continue;
        }
        graph_copy_init_tensor(hash_set, node_copies, node_init, s);
    }
}

struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
    GGML_ASSERT(graph);
    struct ggml_hash_set hash_set = ggml_hash_set_new(graph->visited_hash_set.size);
    struct ggml_tensor ** node_copies = (ggml_tensor **) calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
    bool * node_init = (bool *) calloc(hash_set.size, sizeof(node_init[0]));

    struct ggml_init_params params = {
        /* .mem_size   = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false),
        /* .mem_buffer = */ NULL,
        /* .no_alloc   = */ true
    };

    struct ggml_context * ctx_allocated = ggml_init(params);
    struct ggml_context * ctx_unallocated = ggml_init(params);

    if (ctx_allocated == NULL || ctx_unallocated == NULL) {
        GGML_LOG_ERROR("%s: failed to allocate context for graph copy\n", __func__);
        ggml_hash_set_free(&hash_set);
        free(node_copies);
        free(node_init);
        ggml_free(ctx_allocated);
        ggml_free(ctx_unallocated);
        return {
            /* .buffer          = */ NULL,
            /* .ctx_allocated   = */ NULL,
            /* .ctx_unallocated = */ NULL,
            /* .graph           = */ NULL,
        };
    }

    // dup nodes
    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];
        graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, node);
    }

    // allocate nodes
    ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
    if (buffer == NULL) {
        GGML_LOG_ERROR("%s: failed to allocate buffer for graph copy\n", __func__);
        ggml_hash_set_free(&hash_set);
        free(node_copies);
        free(node_init);
        ggml_free(ctx_allocated);
        ggml_free(ctx_unallocated);
        return {
            /* .buffer          = */ NULL,
            /* .ctx_allocated   = */ NULL,
            /* .ctx_unallocated = */ NULL,
            /* .graph           = */ NULL,
        };
    }

    //printf("copy buffer size: %zu MB\n", ggml_backend_buffer_get_size(buffer) / 1024 / 1024);

    // copy data and init views
    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];
        graph_copy_init_tensor(&hash_set, node_copies, node_init, node);
    }

    // build graph copy
    struct ggml_cgraph * graph_copy = ggml_new_graph_custom(ctx_allocated, graph->size, false);
    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];
        struct ggml_tensor * node_copy = node_copies[ggml_hash_find(&hash_set, node)];
        graph_copy->nodes[i] = node_copy;
    }
    graph_copy->n_nodes = graph->n_nodes;

    ggml_hash_set_free(&hash_set);
    free(node_copies);
    free(node_init);

    return {
        /* .buffer          = */ buffer,
        /* .ctx_allocated   = */ ctx_allocated,
        /* .ctx_unallocated = */ ctx_unallocated,
        /* .graph           = */ graph_copy,
    };
}

void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy) {
    ggml_backend_buffer_free(copy.buffer);
    ggml_free(copy.ctx_allocated);
    ggml_free(copy.ctx_unallocated);
}
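
// Illustrative sketch of how the comparison utility below might be driven. The callback name and
// the two backend variables are placeholders for objects the caller is assumed to provide; the
// callback signature matches ggml_backend_eval_callback as used in this file.
//
//     static bool check_node(int i, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data) {
//         // compare t1 (backend1) against t2 (backend2), e.g. by reading back the data and
//         // computing an error metric; return false to stop the comparison early
//         return true;
//     }
//
//     ggml_backend_compare_graph_backend(backend1, backend2, graph, check_node, NULL, NULL);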
bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data, struct ggml_tensor * test_node) {
    struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend2, graph);
    if (copy.buffer == NULL) {
        return false;
    }

    struct ggml_cgraph * g1 = graph;
    struct ggml_cgraph * g2 = copy.graph;

    assert(g1->n_nodes == g2->n_nodes);

    if (test_node != nullptr) {
        // Compute the whole graph and only test the output for a specific tensor
        ggml_backend_graph_compute(backend1, g1);
        ggml_backend_graph_compute(backend2, g2);

        int test_node_idx = -1;
        for (int i = 0; i < g1->n_nodes; i++) {
            struct ggml_tensor * t1 = g1->nodes[i];
            if (t1 == test_node) {
                test_node_idx = i;
                break;
            }
        }
        GGML_ASSERT(test_node_idx != -1);

        callback(test_node_idx, g1->nodes[test_node_idx], g2->nodes[test_node_idx], user_data);
    } else {
        for (int i = 0; i < g1->n_nodes; i++) {
            struct ggml_tensor * t1 = g1->nodes[i];
            struct ggml_tensor * t2 = g2->nodes[i];

            assert(t1->op == t2->op && ggml_are_same_layout(t1, t2));

            struct ggml_cgraph g1v = ggml_graph_view(g1, i, i + 1);
            struct ggml_cgraph g2v = ggml_graph_view(g2, i, i + 1);

            ggml_backend_graph_compute(backend1, &g1v);
            ggml_backend_graph_compute(backend2, &g2v);

            if (ggml_is_view_op(t1->op)) {
                continue;
            }

            // compare results, calculate rms etc
            if (!callback(i, t1, t2, user_data)) {
                break;
            }
        }
    }

    ggml_backend_graph_copy_free(copy);

    return true;
}

// CPU backend - buffer

static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
    GGML_ASSERT(buffer);
    uintptr_t data = (uintptr_t)buffer->context;

    // align the buffer
    if (data % TENSOR_ALIGNMENT != 0) {
        data = GGML_PAD(data, TENSOR_ALIGNMENT);
    }

    return (void *)data;
}

static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    GGML_ASSERT(buffer);
    ggml_aligned_free(buffer->context, buffer->size);
}

static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
    GGML_ASSERT(tensor);
    memset((char *)tensor->data + offset, value, size);

    GGML_UNUSED(buffer);
}

static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    GGML_ASSERT(tensor);
    memcpy((char *)tensor->data + offset, data, size);

    GGML_UNUSED(buffer);
}

static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
    GGML_ASSERT(tensor);
    memcpy(data, (const char *)tensor->data + offset, size);

    GGML_UNUSED(buffer);
}

static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
    GGML_ASSERT(src);
    if (ggml_backend_buffer_is_host(src->buffer)) {
        memcpy(dst->data, src->data, ggml_nbytes(src));
        return true;
    }
    return false;

    GGML_UNUSED(buffer);
}

static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
    GGML_ASSERT(buffer);
    memset(buffer->context, value, buffer->size);
}

static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
    /* .free_buffer   = */ ggml_backend_cpu_buffer_free_buffer,
    /* .get_base      = */ ggml_backend_cpu_buffer_get_base,
    /* .init_tensor   = */ NULL, // no initialization required
    /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
    /* .set_tensor    = */ ggml_backend_cpu_buffer_set_tensor,
    /* .get_tensor    = */ ggml_backend_cpu_buffer_get_tensor,
    /* .cpy_tensor    = */ ggml_backend_cpu_buffer_cpy_tensor,
    /* .clear         = */ ggml_backend_cpu_buffer_clear,
    /* .reset         = */ NULL,
};

static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
    /* .free_buffer   = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
    /* .get_base      = */ ggml_backend_cpu_buffer_get_base,
    /* .init_tensor   = */ NULL, // no initialization required
    /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
    /* .set_tensor    = */ ggml_backend_cpu_buffer_set_tensor,
    /* .get_tensor    = */ ggml_backend_cpu_buffer_get_tensor,
    /* .cpy_tensor    = */ ggml_backend_cpu_buffer_cpy_tensor,
    /* .clear         = */ ggml_backend_cpu_buffer_clear,
    /* .reset         = */ NULL,
};

// CPU backend buffer type
// this buffer type is defined here to make it available to all backends

static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
    return "CPU";

    GGML_UNUSED(buft);
}

static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
    void * data = ggml_aligned_malloc(size);

    if (data == NULL) {
        GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
        return NULL;
    }

    return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, size);
}

static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
    return TENSOR_ALIGNMENT;

    GGML_UNUSED(buft);
}

static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
    return true;

    GGML_UNUSED(buft);
}

ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
        /* .iface   = */ {
            /* .get_name       = */ ggml_backend_cpu_buffer_type_get_name,
            /* .alloc_buffer   = */ ggml_backend_cpu_buffer_type_alloc_buffer,
            /* .get_alignment  = */ ggml_backend_cpu_buffer_type_get_alignment,
            /* .get_max_size   = */ NULL, // defaults to SIZE_MAX
            /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
            /* .is_host        = */ ggml_backend_cpu_buffer_type_is_host,
        },
        /* .device  = */ NULL, // FIXME ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
        /* .context = */ NULL,
    };

    return &ggml_backend_cpu_buffer_type;
}

static const char * ggml_backend_cpu_buffer_from_ptr_type_get_name(ggml_backend_buffer_type_t buft) {
    return "CPU_Mapped";

    GGML_UNUSED(buft);
}

static ggml_backend_buffer_type_t ggml_backend_cpu_buffer_from_ptr_type(void) {
    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
        /* .iface   = */ {
            /* .get_name       = */ ggml_backend_cpu_buffer_from_ptr_type_get_name,
            /* .alloc_buffer   = */ ggml_backend_cpu_buffer_type_alloc_buffer,
            /* .get_alignment  = */ ggml_backend_cpu_buffer_type_get_alignment,
            /* .get_max_size   = */ NULL, // defaults to SIZE_MAX
            /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
            /* .is_host        = */ ggml_backend_cpu_buffer_type_is_host,
        },
        /* .device  = */ NULL, // FIXME ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
        /* .context = */ NULL,
    };

    return &ggml_backend_cpu_buffer_type;
}

ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
    GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
    return ggml_backend_buffer_init(ggml_backend_cpu_buffer_from_ptr_type(), ggml_backend_cpu_buffer_from_ptr_i, ptr, size);
}
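
// Illustrative sketch: wrapping caller-owned memory (for example, a memory-mapped model file) as
// a CPU buffer so tensors can be placed in it without copying. `mapped_ptr` and `mapped_size`
// are placeholders; the pointer must satisfy the TENSOR_ALIGNMENT assertion above.
//
//     ggml_backend_buffer_t buf = ggml_backend_cpu_buffer_from_ptr(mapped_ptr, mapped_size);
//     // ... create tensors in the buffer ...
//     ggml_backend_buffer_free(buf); // frees the buffer object only, not the mapped memory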