1
0

ggml-hexagon.cpp 135 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511
661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542
055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762277227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547254825492550255125522553255425552556255725582559256025612562256325642565256625672568256925702571257225732574257525762577257825792580258125822583258425852586258725882589259025912592259325942595259625972598259926002601260226032604260526062607260826092610261126122613261426152616261726182619262026212622262326242625262626272628262926302631263226332634263526362637263826392640264126422643264426452646264726482649265026512652265326542655265626572658265926602661266226632664266526662667266826692670267126722673267426752676267726782679268026812682268326842685268626872688268926902691269226932694269526962697269826992700270127022703270427052706270727082709271027112712271327142715271627172718271927202721272227232724272527262727272827292730273127322733273427352736273727382739274027412742274327442745274627472748274927502751275227532754275527562757275827592760276127622763276427652766276727682769277027712772277327742775277627772778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943
2944294529462947294829492950295129522953295429552956295729582959296029612962296329642965296629672968296929702971297229732974297529762977297829792980298129822983298429852986298729882989299029912992299329942995299629972998299930003001300230033004300530063007300830093010301130123013301430153016301730183019302030213022302330243025302630273028302930303031303230333034303530363037303830393040304130423043304430453046304730483049305030513052305330543055305630573058305930603061306230633064306530663067306830693070307130723073307430753076307730783079308030813082308330843085308630873088308930903091309230933094309530963097309830993100310131023103310431053106310731083109311031113112311331143115311631173118311931203121312231233124312531263127312831293130313131323133313431353136313731383139314031413142314331443145314631473148314931503151315231533154315531563157315831593160316131623163316431653166316731683169317031713172317331743175317631773178317931803181318231833184318531863187318831893190319131923193319431953196319731983199320032013202320332043205320632073208320932103211321232133214321532163217321832193220322132223223322432253226322732283229323032313232323332343235323632373238323932403241324232433244324532463247324832493250325132523253325432553256325732583259326032613262326332643265326632673268326932703271327232733274327532763277327832793280328132823283328432853286328732883289329032913292329332943295329632973298329933003301330233033304330533063307330833093310331133123313331433153316331733183319332033213322332333243325332633273328332933303331333233333334333533363337333833393340334133423343334433453346334733483349335033513352335333543355335633573358335933603361336233633364336533663367336833693370337133723373337433753376337733783379338033813382338333843385338633873388338933903391339233933394339533963397339833993400340134023403340434053406340734083409341034113412341334143415341634173418341934203421342234233424342534263427342834293430343134323433343434353436343734383439344034413442344334443445344634473448344934503451345234533454345534563457345834593460346134623463346434653466346734683469347034713472347334743475347634773478347934803481348234833484348534863487348834893490349134923493349434953496349734983499350035013502350335043505350635073508350935103511351235133514351535163517351835193520352135223523352435253526352735283529353035313532353335343535353635373538353935403541354235433544354535463547354835493550355135523553355435553556355735583559356035613562356335643565356635673568356935703571357235733574357535763577357835793580358135823583358435853586358735883589359035913592359335943595359635973598359936003601360236033604360536063607360836093610361136123613361436153616361736183619362036213622362336243625362636273628362936303631363236333634363536363637363836393640364136423643364436453646364736483649365036513652365336543655365636573658365936603661366236633664366536663667366836693670367136723673367436753676367736783679368036813682368336843685368636873688368936903691369236933694369536963697369836993700370137023703370437053706370737083709371037113712371337143715371637173718371937203721372237233724372537263727372837293730373137323733373437353736373737383739374037413742374337443745374637473748374937503751375237533754375537563757
  1. #include <assert.h>
  2. #include <inttypes.h>
  3. #include <stdio.h>
  4. #include <stdlib.h>
  5. #include <string.h>
  6. #include <time.h>
  7. #include <atomic>
  8. #include <chrono>
  9. #include <mutex>
  10. #include <string>
  11. #ifdef _WIN32
  12. # include <sal.h>
  13. # ifndef _WINDOWS
  14. # define _WINDOWS
  15. # endif
  16. #else
  17. # include <semaphore.h>
  18. # include <unistd.h>
  19. #endif
  20. #pragma clang diagnostic ignored "-Wnested-anon-types"
  21. #pragma clang diagnostic ignored "-Wgnu-anonymous-struct"
  22. #include "htp-utils.h"
  23. #include <AEEStdErr.h>
  24. #include <dspqueue.h>
  25. #include <rpcmem.h>
  26. #define GGML_COMMON_IMPL_CPP
  27. #include "ggml-backend-impl.h"
  28. #include "ggml-common.h"
  29. #include "ggml-hexagon.h"
  30. #include "ggml-impl.h"
  31. #include "ggml-quants.h"
  32. #include "htp-msg.h"
  33. #include "htp_iface.h"
  34. static size_t opt_ndev = 1;
  35. static size_t opt_nhvx = 0; // use all
  36. static int opt_arch = 0; // autodetect
  37. static int opt_etm = 0;
  38. static int opt_verbose = 0;
  39. static int opt_profile = 0;
  40. static int opt_hostbuf = 1;
  41. static int opt_experimental = 0;
  42. // Enable all stages by default
  43. static int opt_opmask = HTP_OPMASK_QUEUE | HTP_OPMASK_QUANTIZE | HTP_OPMASK_COMPUTE;
  44. static int opt_opsync = 0; // synchronous ops
  45. #define HEX_VERBOSE(...) \
  46. if (opt_verbose) GGML_LOG_DEBUG(__VA_ARGS__)
  47. #define HEX_PROFILE(...) \
  48. if (opt_profile) GGML_LOG_INFO(__VA_ARGS__)
  49. static inline uint64_t hex_is_aligned(void * addr, uint32_t align) {
  50. return ((size_t) addr & (align - 1)) == 0;
  51. }
  52. static inline size_t hex_round_up(size_t n, size_t m) {
  53. return m * ((n + m - 1) / m);
  54. }
  55. static const char * status_to_str(uint32_t status) {
  56. switch (status) {
  57. case HTP_STATUS_OK:
  58. return "OK";
  59. case HTP_STATUS_NO_SUPPORT:
  60. return "NO-SUPPORT";
  61. case HTP_STATUS_INVAL_PARAMS:
  62. return "INVAL-PARAMS";
  63. case HTP_STATUS_VTCM_TOO_SMALL:
  64. return "VTCM-TOO-SMALL";
  65. case HTP_STATUS_INTERNAL_ERR:
  66. return "INTERNAL-ERROR";
  67. default:
  68. return "UNKNOWN";
  69. }
  70. }
  71. // ** debug helpers
  72. static inline int hex_format_tensor_dims(char * str, const struct ggml_tensor * t) {
  73. if (t->ne[2] == 1 && t->ne[3] == 1) {
  74. return sprintf(str, "%d:%d", (int) t->ne[0], (int) t->ne[1]);
  75. } else {
  76. return sprintf(str, "%d:%d:%d:%d", (int) t->ne[0], (int) t->ne[1], (int) t->ne[2], (int) t->ne[3]);
  77. }
  78. }
  79. static inline void hex_format_op_dims(char * str, const struct ggml_tensor * t) {
  80. char * p = str;
  81. // append src0 and src1 (if any)
  82. if (t->src[0]) {
  83. p += hex_format_tensor_dims(p, t->src[0]);
  84. for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
  85. p += sprintf(p, " x ");
  86. p += hex_format_tensor_dims(p, t->src[i]);
  87. }
  88. p += sprintf(p, " -> ");
  89. }
  90. // format self dims separately for better visual alignment
  91. char self[64];
  92. hex_format_tensor_dims(self, t);
  93. p += sprintf(p, "%s", self);
  94. }
  95. static inline int hex_format_tensor_strides(char * str, const struct ggml_tensor * t) {
  96. const char * c = ggml_is_contiguous(t) ? "" : "!";
  97. if (t->ne[2] == 1 && t->ne[3] == 1) {
  98. return sprintf(str, "%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], c);
  99. } else {
  100. return sprintf(str, "%zu:%zu:%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], (size_t) t->nb[2],
  101. (size_t) t->nb[3], c);
  102. }
  103. }
  104. static inline void hex_format_op_strides(char * str, const struct ggml_tensor * t) {
  105. char * p = str;
  106. // append src0 and src1 (if any)
  107. if (t->src[0]) {
  108. p += hex_format_tensor_strides(p, t->src[0]);
  109. for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
  110. p += sprintf(p, " x ");
  111. p += hex_format_tensor_strides(p, t->src[i]);
  112. }
  113. p += sprintf(p, " -> ");
  114. }
  115. // format self dims separately for better visual alignment
  116. char self[64];
  117. hex_format_tensor_strides(self, t);
  118. p += sprintf(p, "%s", self);
  119. }
  120. static inline void hex_format_op_types(char * str, const struct ggml_tensor * t) {
  121. char * p = str;
  122. // append src0 and src1 (if any)
  123. if (t->src[0]) {
  124. p += sprintf(p, "%s", ggml_type_name(t->src[0]->type));
  125. for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
  126. p += sprintf(p, " x ");
  127. p += sprintf(p, "%s", ggml_type_name(t->src[i]->type));
  128. }
  129. p += sprintf(p, " -> ");
  130. }
  131. p += sprintf(p, "%s", ggml_type_name(t->type));
  132. }
  133. static inline const char * hex_tensor_buff_name(const struct ggml_tensor * t) {
  134. if (t->buffer) {
  135. return ggml_backend_buffer_name(t->buffer);
  136. }
  137. return "NONE";
  138. }
  139. static inline void hex_format_op_buffs(char * str, const struct ggml_tensor * t) {
  140. char * p = str;
  141. // append src0 and src1 (if any)
  142. if (t->src[0]) {
  143. p += sprintf(p, "%s", hex_tensor_buff_name(t->src[0]));
  144. for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
  145. p += sprintf(p, " x ");
  146. p += sprintf(p, "%s", hex_tensor_buff_name(t->src[i]));
  147. }
  148. p += sprintf(p, " -> ");
  149. }
  150. p += sprintf(p, "%s", hex_tensor_buff_name(t));
  151. }
  152. static inline void hex_format_op_names(char * str, const struct ggml_tensor * t) {
  153. char * p = str;
  154. // append src0 and src1 (if any)
  155. if (t->src[0]) {
  156. p += sprintf(p, "%s", t->src[0]->name);
  157. for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
  158. p += sprintf(p, " x ");
  159. p += sprintf(p, "%s", t->src[i]->name);
  160. }
  161. p += sprintf(p, " -> ");
  162. }
  163. p += sprintf(p, "%s", t->name);
  164. }
  165. // ** backend sessions
  166. struct ggml_hexagon_session {
  167. ggml_hexagon_session(int dev_id) noexcept(false);
  168. ~ggml_hexagon_session() noexcept(true);
  169. void allocate(int dev_id) noexcept(false);
  170. void release() noexcept(true);
  171. ggml_backend_buffer_type buffer_type;
  172. ggml_backend_buffer_type repack_buffer_type;
  173. std::string name;
  174. remote_handle64 handle;
  175. dspqueue_t queue;
  176. uint32_t session_id;
  177. uint32_t domain_id;
  178. uint64_t queue_id;
  179. int dev_id;
  180. bool valid_session;
  181. bool valid_handle;
  182. bool valid_queue;
  183. bool valid_iface;
  184. std::atomic<int> op_pending;
  185. uint32_t prof_usecs;
  186. uint32_t prof_cycles;
  187. uint32_t prof_pkts;
  188. };
  189. // Packet callback
  190. static void htp_packet_callback(dspqueue_t queue, AEEResult error, void * context) {
  191. auto sess = static_cast<ggml_hexagon_session *>(context);
  192. // Repeatedly read packets from the queue until it's empty. We don't
  193. // necessarily get a separate callback for each packet, and new packets
  194. // may arrive while we're processing the previous one.
  195. while (1) {
  196. struct htp_general_rsp rsp;
  197. uint32_t rsp_size;
  198. uint32_t flags;
  199. struct dspqueue_buffer bufs[HTP_MAX_PACKET_BUFFERS];
  200. uint32_t n_bufs;
  201. // Read packet from queue
  202. int err = dspqueue_read_noblock(queue, &flags,
  203. HTP_MAX_PACKET_BUFFERS, // Maximum number of buffer references
  204. &n_bufs, // Number of buffer references
  205. bufs, // Buffer references
  206. sizeof(rsp), // Max message length
  207. &rsp_size, // Message length
  208. (uint8_t *) &rsp);
  209. if (err == AEE_EWOULDBLOCK) {
  210. // Consumed all packets available for now
  211. return;
  212. }
  213. if (err != 0) {
  214. GGML_ABORT("ggml-hex: dspqueue_read_noblock failed: 0x%08x\n", (unsigned) err);
  215. }
  216. // Basic sanity checks
  217. if (rsp_size != sizeof(rsp)) {
  218. GGML_ABORT("ggml-hex: dspcall : bad response (size)\n");
  219. }
  220. if (rsp.status != HTP_STATUS_OK) {
  221. GGML_LOG_ERROR("ggml-hex: dspcall : dsp-rsp: %s\n", status_to_str(rsp.status));
  222. // TODO: handle errors
  223. }
  224. // FIXME: update profiling implementation
  225. sess->prof_usecs = rsp.prof_usecs;
  226. sess->prof_cycles = rsp.prof_cycles;
  227. sess->prof_pkts = rsp.prof_pkts;
  228. sess->op_pending--; // atomic dec
  229. }
  230. }
  231. // Error callback - simply terminates with an error. Used where we don't
  232. // expect errors.
  233. [[noreturn]] static void htp_error_callback(dspqueue_t queue, AEEResult error, void * context) {
  234. GGML_ABORT("ggml-hex: dspcall general error 0x%x: for queue %p\n", error, (void *) queue);
  235. }
  236. // ** backend buffers
  237. struct ggml_backend_hexagon_buffer_type_context {
  238. ggml_backend_hexagon_buffer_type_context(const std::string & name, ggml_hexagon_session * sess) {
  239. this->sess = sess;
  240. this->name = name;
  241. }
  242. ggml_hexagon_session * sess;
  243. std::string name;
  244. };
  245. struct ggml_backend_hexagon_buffer_context {
  246. bool mmap_to(ggml_hexagon_session * s) {
  247. HEX_VERBOSE("ggml-hex: %s mmaping buffer: base %p domain-id %d session-id %d size %zu fd %d repack %d\n",
  248. s->name.c_str(), (void *) this->base, s->domain_id, s->session_id, this->size, this->fd,
  249. (int) this->repack);
  250. int err = fastrpc_mmap(s->domain_id, this->fd, (void *) this->base, 0, this->size, FASTRPC_MAP_FD);
  251. if (err != 0) {
  252. GGML_LOG_ERROR("ggml-hex: buffer mapping failed : domain_id %d size %zu fd %d error 0x%08x\n",
  253. s->domain_id, this->size, this->fd, (unsigned) err);
  254. return false;
  255. }
  256. return true;
  257. }
  258. bool mmap() {
  259. if (this->mapped) {
  260. return true;
  261. }
  262. if (!mmap_to(this->sess)) {
  263. return false;
  264. }
  265. this->mapped = true;
  266. return true;
  267. }
  268. void munmap() {
  269. if (!this->mapped) {
  270. return;
  271. }
  272. fastrpc_munmap(this->sess->domain_id, this->fd, this->base, this->size);
  273. this->mapped = false;
  274. }
  275. ggml_backend_hexagon_buffer_context(ggml_hexagon_session * sess, size_t size, bool repack) {
  276. size += 4 * 1024; // extra page for padding
  277. this->base = (uint8_t *) rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size);
  278. if (!this->base) {
  279. GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer : size %zu\n", sess->name.c_str(), size);
  280. throw std::runtime_error("ggml-hex: rpcmem_alloc failed (see log for details)");
  281. }
  282. this->fd = rpcmem_to_fd(this->base);
  283. if (this->fd < 0) {
  284. GGML_LOG_ERROR("ggml-hex: %s failed to get FD for buffer %p\n", sess->name.c_str(), (void *) this->base);
  285. rpcmem_free(this->base);
  286. this->base = NULL;
  287. throw std::runtime_error("ggml-hex: rpcmem_to_fd failed (see log for details)");
  288. }
  289. HEX_VERBOSE("ggml-hex: %s allocated buffer: base %p size %zu fd %d repack %d\n", sess->name.c_str(),
  290. (void *) this->base, size, this->fd, (int) repack);
  291. this->sess = sess;
  292. this->size = size;
  293. this->mapped = false;
  294. this->repack = repack;
  295. }
  296. ~ggml_backend_hexagon_buffer_context() {
  297. munmap();
  298. if (this->base) {
  299. rpcmem_free(this->base);
  300. this->base = NULL;
  301. }
  302. }
  303. ggml_hexagon_session * sess; // primary session
  304. uint8_t * base;
  305. size_t size;
  306. int fd;
  307. bool mapped; // mmap is done
  308. bool repack; // repacked buffer
  309. };
  310. static ggml_hexagon_session * ggml_backend_hexagon_buffer_get_sess(ggml_backend_buffer_t buffer) {
  311. return static_cast<ggml_backend_hexagon_buffer_type_context *>(buffer->buft->context)->sess;
  312. }
  313. static void ggml_backend_hexagon_buffer_free_buffer(ggml_backend_buffer_t buffer) {
  314. auto ctx = static_cast<ggml_backend_hexagon_buffer_context *>(buffer->context);
  315. delete ctx;
  316. }
  317. static void * ggml_backend_hexagon_buffer_get_base(ggml_backend_buffer_t buffer) {
  318. auto ctx = static_cast<ggml_backend_hexagon_buffer_context *>(buffer->context);
  319. return ctx->base;
  320. }
  321. static enum ggml_status ggml_backend_hexagon_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
  322. auto ctx = static_cast<ggml_backend_hexagon_buffer_context *>(buffer->context);
  323. auto sess = ctx->sess;
  324. HEX_VERBOSE("ggml-hex: %s init-tensor %s : base %p data %p nbytes %zu usage %d repack %d\n", sess->name.c_str(),
  325. tensor->name, (void *) ctx->base, tensor->data, ggml_nbytes(tensor), (int) buffer->usage,
  326. (int) ctx->repack);
  327. if (tensor->view_src != NULL && tensor->view_offs == 0) {
  328. ; // nothing to do for the view
  329. } else {
  330. if (!ctx->mapped) {
  331. ctx->mmap();
  332. }
  333. }
  334. return GGML_STATUS_SUCCESS;
  335. }
  336. // ======== Q4x4x2 ====================
  337. struct x2_q4 {
  338. int v[2];
  339. };
  340. static x2_q4 unpack_q4(uint8_t v) {
  341. x2_q4 x = { (int) (v & 0x0f) - 8, (int) (v >> 4) - 8 };
  342. return x;
  343. }
  344. static void dump_block_q4_0(const block_q4_0 * b, int i) {
  345. HEX_VERBOSE("ggml-hex: repack q4_0 %d: %d %d %d %d ... %d %d %d %d : %.6f\n", i, unpack_q4(b->qs[0]).v[0],
  346. unpack_q4(b->qs[1]).v[0], unpack_q4(b->qs[2]).v[0], unpack_q4(b->qs[3]).v[0], unpack_q4(b->qs[12]).v[1],
  347. unpack_q4(b->qs[13]).v[1], unpack_q4(b->qs[14]).v[1], unpack_q4(b->qs[15]).v[1],
  348. GGML_FP16_TO_FP32(b->d));
  349. }
  350. static void dump_packed_block_q4x4x2(const uint8_t * v, unsigned int i, size_t k) {
  351. static const int qk = QK_Q4_0x4x2;
  352. const int dblk_size = 8 * 2; // 8x __fp16
  353. const int qblk_size = qk / 2; // int4
  354. const int qrow_size = k / 2; // int4 (not padded)
  355. const uint8_t * v_q = v + 0; // quants first
  356. const uint8_t * v_d = v + qrow_size; // then scales
  357. const uint8_t * q = v_q + i * qblk_size;
  358. const ggml_half * d = (const ggml_half *) (v_d + i * dblk_size);
  359. HEX_VERBOSE("ggml-hex: repack q4x4x2-%d: %d %d %d %d ... %d %d %d %d ... %d %d %d %d : %.6f %.6f %.6f %.6f\n", i,
  360. unpack_q4(q[0]).v[0], unpack_q4(q[1]).v[0], unpack_q4(q[2]).v[0], unpack_q4(q[3]).v[0],
  361. unpack_q4(q[60]).v[0], unpack_q4(q[61]).v[0], unpack_q4(q[62]).v[0], unpack_q4(q[63]).v[0],
  362. unpack_q4(q[124]).v[0], unpack_q4(q[125]).v[0], unpack_q4(q[126]).v[0], unpack_q4(q[127]).v[0],
  363. GGML_FP16_TO_FP32(d[0]), GGML_FP16_TO_FP32(d[1]), GGML_FP16_TO_FP32(d[2]), GGML_FP16_TO_FP32(d[3]));
  364. HEX_VERBOSE("ggml-hex: repack q4x4x2-%d: %d %d %d %d ... %d %d %d %d ... %d %d %d %d : %.6f %.6f %.6f %.6f\n",
  365. i + 1, unpack_q4(q[0]).v[1], unpack_q4(q[1]).v[1], unpack_q4(q[2]).v[1], unpack_q4(q[3]).v[1],
  366. unpack_q4(q[60]).v[1], unpack_q4(q[61]).v[1], unpack_q4(q[62]).v[1], unpack_q4(q[63]).v[1],
  367. unpack_q4(q[124]).v[1], unpack_q4(q[125]).v[1], unpack_q4(q[126]).v[1], unpack_q4(q[127]).v[1],
  368. GGML_FP16_TO_FP32(d[4]), GGML_FP16_TO_FP32(d[5]), GGML_FP16_TO_FP32(d[6]), GGML_FP16_TO_FP32(d[7]));
  369. }
  370. static void unpack_q4_0_quants(uint8_t * qs, const block_q4_0 * x, unsigned int bi) {
  371. static const int qk = QK4_0;
  372. for (unsigned int i = 0; i < qk / 2; ++i) {
  373. const int x0 = (x->qs[i] & 0x0F);
  374. const int x1 = (x->qs[i] >> 4);
  375. qs[bi * qk + i + 0] = x0;
  376. qs[bi * qk + i + qk / 2] = x1;
  377. }
  378. }
  379. static void pack_q4_0_quants(block_q4_0 * x, const uint8_t * qs, unsigned int bi) {
  380. static const int qk = QK4_0;
  381. for (unsigned int i = 0; i < qk / 2; ++i) {
  382. const uint8_t x0 = qs[bi * qk + i + 0];
  383. const uint8_t x1 = qs[bi * qk + i + qk / 2];
  384. x->qs[i] = x0 | (x1 << 4);
  385. }
  386. }
  387. static void repack_row_q4x4x2(uint8_t * y, const block_q4_0 * x, int64_t k) {
  388. static const int qk = QK_Q4_0x4x2;
  389. const int nb = (k + qk - 1) / qk; // number of blocks (padded)
  390. const int dblk_size = 8 * 2; // 8x __fp16
  391. const int qblk_size = qk / 2; // int4
  392. const int qrow_size = k / 2; // int4 (not padded to blocks)
  393. uint8_t * y_q = y + 0; // quants first
  394. uint8_t * y_d = y + qrow_size; // then scales
  395. if (opt_verbose > 2) {
  396. for (int i = 0; i < nb; i++) {
  397. dump_block_q4_0(&x[i * 8 + 0], 0);
  398. dump_block_q4_0(&x[i * 8 + 1], 1);
  399. dump_block_q4_0(&x[i * 8 + 2], 2);
  400. dump_block_q4_0(&x[i * 8 + 3], 3);
  401. dump_block_q4_0(&x[i * 8 + 4], 4);
  402. dump_block_q4_0(&x[i * 8 + 5], 5);
  403. dump_block_q4_0(&x[i * 8 + 6], 6);
  404. dump_block_q4_0(&x[i * 8 + 7], 7);
  405. }
  406. }
  407. // Repack the quants
  408. for (int i = 0; i < nb; i++) {
  409. uint8_t qs[QK_Q4_0x4x2]; // unpacked quants
  410. unpack_q4_0_quants(qs, &x[i * 8 + 0], 0);
  411. unpack_q4_0_quants(qs, &x[i * 8 + 1], 1);
  412. unpack_q4_0_quants(qs, &x[i * 8 + 2], 2);
  413. unpack_q4_0_quants(qs, &x[i * 8 + 3], 3);
  414. unpack_q4_0_quants(qs, &x[i * 8 + 4], 4);
  415. unpack_q4_0_quants(qs, &x[i * 8 + 5], 5);
  416. unpack_q4_0_quants(qs, &x[i * 8 + 6], 6);
  417. unpack_q4_0_quants(qs, &x[i * 8 + 7], 7);
  418. uint8_t * q = y_q + (i * qblk_size);
  419. for (int j = 0; j < qk / 2; j++) {
  420. q[j] = (qs[j + 128] << 4) | qs[j];
  421. }
  422. }
  423. // Repack the scales
  424. // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2)
  425. // the last block is truncated and overriden by the scales.
  426. for (int i = 0; i < nb; i++) {
  427. // Repack the scales
  428. ggml_half * d = (ggml_half *) (y_d + i * dblk_size);
  429. d[0] = x[i * 8 + 0].d;
  430. d[1] = x[i * 8 + 1].d;
  431. d[2] = x[i * 8 + 2].d;
  432. d[3] = x[i * 8 + 3].d;
  433. d[4] = x[i * 8 + 4].d;
  434. d[5] = x[i * 8 + 5].d;
  435. d[6] = x[i * 8 + 6].d;
  436. d[7] = x[i * 8 + 7].d;
  437. }
  438. if (opt_verbose > 1) {
  439. for (int i = 0; i < nb; i++) {
  440. dump_packed_block_q4x4x2(y, i, k);
  441. }
  442. }
  443. }
  444. static void unpack_row_q4x4x2(block_q4_0 * x, const uint8_t * y, int64_t k) {
  445. static const int qk = QK_Q4_0x4x2;
  446. const int nb = (k + qk - 1) / qk; // number of blocks (padded)
  447. const int dblk_size = 8 * 2; // 8x __fp16
  448. const int qblk_size = qk / 2; // int4
  449. const int qrow_size = k / 2; // int4 (not padded to blocks)
  450. const uint8_t * y_q = y + 0; // quants first
  451. const uint8_t * y_d = y + qrow_size; // then scales
  452. if (opt_verbose > 1) {
  453. for (int i = 0; i < nb; i++) {
  454. dump_packed_block_q4x4x2(y, i, k);
  455. }
  456. }
  457. // Unpack the quants
  458. for (int i = 0; i < nb; i++) {
  459. uint8_t qs[QK_Q4_0x4x2]; // unpacked quants
  460. const uint8_t * q = y_q + (i * qblk_size);
  461. for (int j = 0; j < qk / 2; j++) {
  462. qs[j] = q[j] & 0xf;
  463. qs[j + 128] = q[j] >> 4;
  464. }
  465. pack_q4_0_quants(&x[i * 8 + 0], qs, 0);
  466. pack_q4_0_quants(&x[i * 8 + 1], qs, 1);
  467. pack_q4_0_quants(&x[i * 8 + 2], qs, 2);
  468. pack_q4_0_quants(&x[i * 8 + 3], qs, 3);
  469. pack_q4_0_quants(&x[i * 8 + 4], qs, 4);
  470. pack_q4_0_quants(&x[i * 8 + 5], qs, 5);
  471. pack_q4_0_quants(&x[i * 8 + 6], qs, 6);
  472. pack_q4_0_quants(&x[i * 8 + 7], qs, 7);
  473. }
  474. // Repack the scales
  475. // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2)
  476. // the last block is truncated and overriden by the scales.
  477. for (int i = 0; i < nb; i++) {
  478. // Unpack the scales
  479. const ggml_half * d = (const ggml_half *) (y_d + i * dblk_size);
  480. x[i * 8 + 0].d = d[0];
  481. x[i * 8 + 1].d = d[1];
  482. x[i * 8 + 2].d = d[2];
  483. x[i * 8 + 3].d = d[3];
  484. x[i * 8 + 4].d = d[4];
  485. x[i * 8 + 5].d = d[5];
  486. x[i * 8 + 6].d = d[6];
  487. x[i * 8 + 7].d = d[7];
  488. }
  489. if (opt_verbose > 2) {
  490. for (int i = 0; i < nb; i++) {
  491. dump_block_q4_0(&x[i * 8 + 0], 0);
  492. dump_block_q4_0(&x[i * 8 + 1], 1);
  493. dump_block_q4_0(&x[i * 8 + 2], 2);
  494. dump_block_q4_0(&x[i * 8 + 3], 3);
  495. dump_block_q4_0(&x[i * 8 + 4], 4);
  496. dump_block_q4_0(&x[i * 8 + 5], 5);
  497. dump_block_q4_0(&x[i * 8 + 6], 6);
  498. dump_block_q4_0(&x[i * 8 + 7], 7);
  499. }
  500. }
  501. }
  502. static void init_row_q4x4x2(block_q4_0 * x, int64_t k) {
  503. static const int qk = QK_Q4_0x4x2;
  504. const int nb = (k + qk - 1) / qk; // number of blocks (padded)
  505. // Init the quants such that they unpack into zeros
  506. uint8_t qs[QK_Q4_0x4x2]; // unpacked quants
  507. memset(qs, 8, sizeof(qs));
  508. for (int i = 0; i < nb; i++) {
  509. pack_q4_0_quants(&x[i * 8 + 0], qs, 0);
  510. pack_q4_0_quants(&x[i * 8 + 1], qs, 1);
  511. pack_q4_0_quants(&x[i * 8 + 2], qs, 2);
  512. pack_q4_0_quants(&x[i * 8 + 3], qs, 3);
  513. pack_q4_0_quants(&x[i * 8 + 4], qs, 4);
  514. pack_q4_0_quants(&x[i * 8 + 5], qs, 5);
  515. pack_q4_0_quants(&x[i * 8 + 6], qs, 6);
  516. pack_q4_0_quants(&x[i * 8 + 7], qs, 7);
  517. }
  518. // Init the scales
  519. // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2)
  520. // the last block is truncated and overriden by the scales.
  521. for (int i = 0; i < nb; i++) {
  522. // Unpack the scales
  523. x[i * 8 + 0].d = 0;
  524. x[i * 8 + 1].d = 0;
  525. x[i * 8 + 2].d = 0;
  526. x[i * 8 + 3].d = 0;
  527. x[i * 8 + 4].d = 0;
  528. x[i * 8 + 5].d = 0;
  529. x[i * 8 + 6].d = 0;
  530. x[i * 8 + 7].d = 0;
  531. }
  532. }
  533. // repack q4_0 data into q4x4x2 tensor
  534. static void repack_q4_0_q4x4x2(ggml_tensor * t, const void * data, size_t size) {
  535. int64_t nrows = ggml_nrows(t);
  536. size_t row_size = ggml_row_size(t->type, t->ne[0]);
  537. size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2)); // extra elements for the pad
  538. size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)
  539. void * buf_pd = ggml_aligned_malloc(row_size_pd);
  540. GGML_ASSERT(buf_pd != NULL);
  541. void * buf_rp = ggml_aligned_malloc(row_size_rp);
  542. GGML_ASSERT(buf_rp != NULL);
  543. HEX_VERBOSE("ggml-hex: repack-q4_0-q4x4x2 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, size,
  544. t->ne[0], nrows, row_size);
  545. init_row_q4x4x2((block_q4_0 *) buf_pd, t->ne[0]); // init padded buffer to make sure the tail is all zeros
  546. for (int64_t i = 0; i < nrows; i++) {
  547. const uint8_t * src = (const uint8_t *) data + (i * row_size);
  548. uint8_t * dst = (uint8_t *) t->data + (i * row_size);
  549. memcpy(buf_pd, src, row_size);
  550. repack_row_q4x4x2((uint8_t *) buf_rp, (const block_q4_0 *) buf_pd, t->ne[0]);
  551. memcpy(dst, buf_rp, row_size);
  552. }
  553. ggml_aligned_free(buf_pd, row_size_pd);
  554. ggml_aligned_free(buf_rp, row_size_rp);
  555. }
  556. // repack q4x4x2 tensor into q4_0 data
  557. static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size) {
  558. int64_t nrows = ggml_nrows(t);
  559. size_t row_size = ggml_row_size(t->type, t->ne[0]);
  560. size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2)); // extra elements for the pad
  561. size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)
  562. void * buf_pd = ggml_aligned_malloc(row_size_pd);
  563. GGML_ASSERT(buf_pd != NULL);
  564. void * buf_rp = ggml_aligned_malloc(row_size_rp);
  565. GGML_ASSERT(buf_rp != NULL);
  566. HEX_VERBOSE("ggml-hex: repack-q4x4x2-q4_0 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, size,
  567. t->ne[0], nrows, row_size);
  568. memset(buf_pd, 0, row_size_pd); // clear-out padded buffer to make sure the tail is all zeros
  569. for (int64_t i = 0; i < nrows; i++) {
  570. const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
  571. uint8_t * dst = (uint8_t *) data + (i * row_size);
  572. memcpy(buf_pd, src, row_size);
  573. unpack_row_q4x4x2((block_q4_0 *) buf_rp, (const uint8_t *) buf_pd, t->ne[0]);
  574. memcpy(dst, buf_rp, row_size);
  575. }
  576. ggml_aligned_free(buf_pd, row_size_pd);
  577. ggml_aligned_free(buf_rp, row_size_rp);
  578. }
  579. // ======== Q8x4x2 ====================
  580. static void dump_block_q8_0(const block_q8_0 * b, int i) {
  581. HEX_VERBOSE("ggml-hex: repack q8_0 %d: %d %d %d %d ... %d %d %d %d : %.6f\n", i, b->qs[0], b->qs[1], b->qs[2],
  582. b->qs[3], b->qs[28], b->qs[29], b->qs[30], b->qs[31], GGML_FP16_TO_FP32(b->d));
  583. }
  584. static void dump_packed_block_q8x4x2(const uint8_t * v, unsigned int i, size_t k) {
  585. static const int qk = QK_Q8_0x4x2;
  586. const int dblk_size = 8 * 2; // 8x __fp16
  587. const int qblk_size = qk; // int8
  588. const int qrow_size = k; // int8 (not padded)
  589. const uint8_t * v_q = v + 0; // quants first
  590. const uint8_t * v_d = v + qrow_size; // then scales
  591. const uint8_t * q = v_q + i * qblk_size;
  592. const ggml_half * d = (const ggml_half *) (v_d + i * dblk_size);
  593. HEX_VERBOSE("ggml-hex: repack q8x4x2-%d: %d %d %d %d ... %d %d %d %d ... %d %d %d %d : %.6f %.6f %.6f %.6f\n", i,
  594. q[0], q[1], q[2], q[3], q[60], q[61], q[62], q[63], q[124], q[125], q[126], q[127],
  595. GGML_FP16_TO_FP32(d[0]), GGML_FP16_TO_FP32(d[1]), GGML_FP16_TO_FP32(d[2]), GGML_FP16_TO_FP32(d[3]));
  596. HEX_VERBOSE("ggml-hex: repack q8x4x2-%d: %d %d %d %d ... %d %d %d %d ... %d %d %d %d : %.6f %.6f %.6f %.6f\n",
  597. i + 1, q[128], q[129], q[130], q[131], q[192], q[193], q[194], q[195], q[252], q[253], q[254], q[255],
  598. GGML_FP16_TO_FP32(d[4]), GGML_FP16_TO_FP32(d[5]), GGML_FP16_TO_FP32(d[6]), GGML_FP16_TO_FP32(d[7]));
  599. }
  600. static void unpack_q8_0_quants(uint8_t * qs, const block_q8_0 * x, unsigned int bi) {
  601. static const int qk = QK8_0;
  602. for (unsigned int i = 0; i < qk; ++i) {
  603. qs[bi * qk + i] = x->qs[i];
  604. }
  605. }
  606. static void pack_q8_0_quants(block_q8_0 * x, const uint8_t * qs, unsigned int bi) {
  607. static const int qk = QK8_0;
  608. for (unsigned int i = 0; i < qk; ++i) {
  609. x->qs[i] = qs[bi * qk + i];
  610. }
  611. }
  612. static void repack_row_q8x4x2(uint8_t * y, const block_q8_0 * x, int64_t k) {
  613. static const int qk = QK_Q8_0x4x2;
  614. const int nb = (k + qk - 1) / qk; // number of blocks (padded)
  615. const int dblk_size = 8 * 2; // 8x __fp16
  616. const int qblk_size = qk; // int8
  617. const int qrow_size = k; // int8 (not padded to blocks)
  618. uint8_t * y_q = y + 0; // quants first
  619. uint8_t * y_d = y + qrow_size; // then scales
  620. if (opt_verbose > 2) {
  621. for (int i = 0; i < nb; i++) {
  622. dump_block_q8_0(&x[i * 8 + 0], 0);
  623. dump_block_q8_0(&x[i * 8 + 1], 1);
  624. dump_block_q8_0(&x[i * 8 + 2], 2);
  625. dump_block_q8_0(&x[i * 8 + 3], 3);
  626. dump_block_q8_0(&x[i * 8 + 4], 4);
  627. dump_block_q8_0(&x[i * 8 + 5], 5);
  628. dump_block_q8_0(&x[i * 8 + 6], 6);
  629. dump_block_q8_0(&x[i * 8 + 7], 7);
  630. }
  631. }
  632. // Repack the quants
  633. for (int i = 0; i < nb; i++) {
  634. uint8_t qs[QK_Q8_0x4x2]; // unpacked quants
  635. unpack_q8_0_quants(qs, &x[i * 8 + 0], 0);
  636. unpack_q8_0_quants(qs, &x[i * 8 + 1], 1);
  637. unpack_q8_0_quants(qs, &x[i * 8 + 2], 2);
  638. unpack_q8_0_quants(qs, &x[i * 8 + 3], 3);
  639. unpack_q8_0_quants(qs, &x[i * 8 + 4], 4);
  640. unpack_q8_0_quants(qs, &x[i * 8 + 5], 5);
  641. unpack_q8_0_quants(qs, &x[i * 8 + 6], 6);
  642. unpack_q8_0_quants(qs, &x[i * 8 + 7], 7);
  643. uint8_t * q = y_q + (i * qblk_size);
  644. for (int j = 0; j < qk; j++) {
  645. q[j] = qs[j];
  646. }
  647. }
  648. // Repack the scales
  649. // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2)
  650. // the last block is truncated and overriden by the scales.
  651. for (int i = 0; i < nb; i++) {
  652. // Repack the scales
  653. ggml_half * d = (ggml_half *) (y_d + i * dblk_size);
  654. d[0] = x[i * 8 + 0].d;
  655. d[1] = x[i * 8 + 1].d;
  656. d[2] = x[i * 8 + 2].d;
  657. d[3] = x[i * 8 + 3].d;
  658. d[4] = x[i * 8 + 4].d;
  659. d[5] = x[i * 8 + 5].d;
  660. d[6] = x[i * 8 + 6].d;
  661. d[7] = x[i * 8 + 7].d;
  662. }
  663. if (opt_verbose > 1) {
  664. for (int i = 0; i < nb; i++) {
  665. dump_packed_block_q8x4x2(y, i, k);
  666. }
  667. }
  668. }
  669. static void unpack_row_q8x4x2(block_q8_0 * x, const uint8_t * y, int64_t k) {
  670. static const int qk = QK_Q8_0x4x2;
  671. const int nb = (k + qk - 1) / qk; // number of blocks (padded)
  672. const int dblk_size = 8 * 2; // 8x __fp16
  673. const int qblk_size = qk; // int8
  674. const int qrow_size = k; // int8 (not padded to blocks)
  675. const uint8_t * y_q = y + 0; // quants first
  676. const uint8_t * y_d = y + qrow_size; // then scales
  677. if (opt_verbose > 1) {
  678. for (int i = 0; i < nb; i++) {
  679. dump_packed_block_q8x4x2(y, i, k);
  680. }
  681. }
  682. // Unpack the quants
  683. for (int i = 0; i < nb; i++) {
  684. uint8_t qs[QK_Q4_0x4x2]; // unpacked quants
  685. const uint8_t * q = y_q + (i * qblk_size);
  686. for (int j = 0; j < qk; j++) {
  687. qs[j] = q[j];
  688. }
  689. pack_q8_0_quants(&x[i * 8 + 0], qs, 0);
  690. pack_q8_0_quants(&x[i * 8 + 1], qs, 1);
  691. pack_q8_0_quants(&x[i * 8 + 2], qs, 2);
  692. pack_q8_0_quants(&x[i * 8 + 3], qs, 3);
  693. pack_q8_0_quants(&x[i * 8 + 4], qs, 4);
  694. pack_q8_0_quants(&x[i * 8 + 5], qs, 5);
  695. pack_q8_0_quants(&x[i * 8 + 6], qs, 6);
  696. pack_q8_0_quants(&x[i * 8 + 7], qs, 7);
  697. }
  698. // Repack the scales
  699. // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2)
  700. // the last block is truncated and overriden by the scales.
  701. for (int i = 0; i < nb; i++) {
  702. // Unpack the scales
  703. const ggml_half * d = (const ggml_half *) (y_d + i * dblk_size);
  704. x[i * 8 + 0].d = d[0];
  705. x[i * 8 + 1].d = d[1];
  706. x[i * 8 + 2].d = d[2];
  707. x[i * 8 + 3].d = d[3];
  708. x[i * 8 + 4].d = d[4];
  709. x[i * 8 + 5].d = d[5];
  710. x[i * 8 + 6].d = d[6];
  711. x[i * 8 + 7].d = d[7];
  712. }
  713. if (opt_verbose > 2) {
  714. for (int i = 0; i < nb; i++) {
  715. dump_block_q8_0(&x[i * 8 + 0], 0);
  716. dump_block_q8_0(&x[i * 8 + 1], 1);
  717. dump_block_q8_0(&x[i * 8 + 2], 2);
  718. dump_block_q8_0(&x[i * 8 + 3], 3);
  719. dump_block_q8_0(&x[i * 8 + 4], 4);
  720. dump_block_q8_0(&x[i * 8 + 5], 5);
  721. dump_block_q8_0(&x[i * 8 + 6], 6);
  722. dump_block_q8_0(&x[i * 8 + 7], 7);
  723. }
  724. }
  725. }
  726. static void init_row_q8x4x2(block_q8_0 * x, int64_t k) {
  727. static const int qk = QK_Q8_0x4x2;
  728. const int nb = (k + qk - 1) / qk; // number of blocks (padded)
  729. // Init the quants such that they unpack into zeros
  730. uint8_t qs[QK_Q8_0x4x2]; // unpacked quants
  731. memset(qs, 0, sizeof(qs));
  732. for (int i = 0; i < nb; i++) {
  733. pack_q8_0_quants(&x[i * 8 + 0], qs, 0);
  734. pack_q8_0_quants(&x[i * 8 + 1], qs, 1);
  735. pack_q8_0_quants(&x[i * 8 + 2], qs, 2);
  736. pack_q8_0_quants(&x[i * 8 + 3], qs, 3);
  737. pack_q8_0_quants(&x[i * 8 + 4], qs, 4);
  738. pack_q8_0_quants(&x[i * 8 + 5], qs, 5);
  739. pack_q8_0_quants(&x[i * 8 + 6], qs, 6);
  740. pack_q8_0_quants(&x[i * 8 + 7], qs, 7);
  741. }
  742. // Init the scales
  743. // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q8_0x4x2)
  744. // the last block is truncated and overriden by the scales.
  745. for (int i = 0; i < nb; i++) {
  746. // Unpack the scales
  747. x[i * 8 + 0].d = 0;
  748. x[i * 8 + 1].d = 0;
  749. x[i * 8 + 2].d = 0;
  750. x[i * 8 + 3].d = 0;
  751. x[i * 8 + 4].d = 0;
  752. x[i * 8 + 5].d = 0;
  753. x[i * 8 + 6].d = 0;
  754. x[i * 8 + 7].d = 0;
  755. }
  756. }
  757. // repack q8_0 data into q8x4x2 tensor
  758. static void repack_q8_0_q8x4x2(ggml_tensor * t, const void * data, size_t size) {
  759. int64_t nrows = ggml_nrows(t);
  760. size_t row_size = ggml_row_size(t->type, t->ne[0]);
  761. size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q8_0x4x2)); // extra elements for the pad
  762. size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)
  763. void * buf_pd = ggml_aligned_malloc(row_size_pd);
  764. GGML_ASSERT(buf_pd != NULL);
  765. void * buf_rp = ggml_aligned_malloc(row_size_rp);
  766. GGML_ASSERT(buf_rp != NULL);
  767. HEX_VERBOSE("ggml-hex: repack-q8_0-q8x4x2 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, size,
  768. t->ne[0], nrows, row_size);
  769. init_row_q8x4x2((block_q8_0 *) buf_pd, t->ne[0]); // init padded buffer to make sure the tail is all zeros
  770. for (int64_t i = 0; i < nrows; i++) {
  771. const uint8_t * src = (const uint8_t *) data + (i * row_size);
  772. uint8_t * dst = (uint8_t *) t->data + (i * row_size);
  773. memcpy(buf_pd, src, row_size);
  774. repack_row_q8x4x2((uint8_t *) buf_rp, (const block_q8_0 *) buf_pd, t->ne[0]);
  775. memcpy(dst, buf_rp, row_size);
  776. }
  777. ggml_aligned_free(buf_pd, row_size_pd);
  778. ggml_aligned_free(buf_rp, row_size_rp);
  779. }
  780. // repack q8x4x2 tensor into q8_0 data
  781. static void repack_q8x4x2_q8_0(void * data, const ggml_tensor * t, size_t size) {
  782. int64_t nrows = ggml_nrows(t);
  783. size_t row_size = ggml_row_size(t->type, t->ne[0]);
  784. size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q8_0x4x2)); // extra elements for the pad
  785. size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)
  786. void * buf_pd = ggml_aligned_malloc(row_size_pd);
  787. GGML_ASSERT(buf_pd != NULL);
  788. void * buf_rp = ggml_aligned_malloc(row_size_rp);
  789. GGML_ASSERT(buf_rp != NULL);
  790. HEX_VERBOSE("ggml-hex: repack-q8x4x2-q8_0 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, size,
  791. t->ne[0], nrows, row_size);
  792. memset(buf_pd, 0, row_size_pd); // clear-out padded buffer to make sure the tail is all zeros
  793. for (int64_t i = 0; i < nrows; i++) {
  794. const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
  795. uint8_t * dst = (uint8_t *) data + (i * row_size);
  796. memcpy(buf_pd, src, row_size);
  797. unpack_row_q8x4x2((block_q8_0 *) buf_rp, (const uint8_t *) buf_pd, t->ne[0]);
  798. memcpy(dst, buf_rp, row_size);
  799. }
  800. ggml_aligned_free(buf_pd, row_size_pd);
  801. ggml_aligned_free(buf_rp, row_size_rp);
  802. }
  803. // ======== MXFP4x4x2 ====================
  804. struct x2_mxfp4 {
  805. int v[2];
  806. };
  807. static x2_mxfp4 unpack_mxfp4(uint8_t v) {
  808. x2_mxfp4 x;
  809. x.v[0] = kvalues_mxfp4[(v & 0x0f)];
  810. x.v[1] = kvalues_mxfp4[(v >> 4)];
  811. return x;
  812. }
  813. static void dump_block_mxfp4(const block_mxfp4 * b, int i) {
  814. HEX_VERBOSE("ggml-hex: repack mxfp4 %d: %d %d %d %d ... %d %d %d %d : %.6f\n", i, unpack_mxfp4(b->qs[0]).v[0],
  815. unpack_mxfp4(b->qs[1]).v[0], unpack_mxfp4(b->qs[2]).v[0], unpack_mxfp4(b->qs[3]).v[0],
  816. unpack_mxfp4(b->qs[12]).v[1], unpack_mxfp4(b->qs[13]).v[1], unpack_mxfp4(b->qs[14]).v[1],
  817. unpack_mxfp4(b->qs[15]).v[1], GGML_E8M0_TO_FP32_HALF(b->e));
  818. }
  819. static void dump_packed_block_mxfp4x4x2(const uint8_t * v, unsigned int i, size_t k) {
  820. static const int qk = QK_MXFP4x4x2;
  821. const int eblk_size = 8 * 1; // 8x E8M0
  822. const int qblk_size = qk / 2; // int4
  823. const int qrow_size = k / 2; // int4 (not padded)
  824. const uint8_t * v_q = v + 0; // quants first
  825. const uint8_t * v_e = v + qrow_size; // then scales
  826. const uint8_t * q = v_q + i * qblk_size;
  827. const uint8_t * e = (const uint8_t *) (v_e + i * eblk_size);
  828. HEX_VERBOSE("ggml-hex: repack mxfp4x4x2-%d: %d %d %d %d ... %d %d %d %d ... %d %d %d %d : %.6f %.6f %.6f %.6f\n", i,
  829. unpack_mxfp4(q[0]).v[0], unpack_mxfp4(q[1]).v[0], unpack_mxfp4(q[2]).v[0], unpack_mxfp4(q[3]).v[0],
  830. unpack_mxfp4(q[60]).v[0], unpack_mxfp4(q[61]).v[0], unpack_mxfp4(q[62]).v[0], unpack_mxfp4(q[63]).v[0],
  831. unpack_mxfp4(q[124]).v[0], unpack_mxfp4(q[125]).v[0], unpack_mxfp4(q[126]).v[0],
  832. unpack_mxfp4(q[127]).v[0], GGML_E8M0_TO_FP32_HALF(e[0]), GGML_E8M0_TO_FP32_HALF(e[1]),
  833. GGML_E8M0_TO_FP32_HALF(e[2]), GGML_E8M0_TO_FP32_HALF(e[3]));
  834. HEX_VERBOSE("ggml-hex: repack mxfp4x4x2-%d: %d %d %d %d ... %d %d %d %d ... %d %d %d %d : %.6f %.6f %.6f %.6f\n",
  835. i + 1, unpack_mxfp4(q[0]).v[1], unpack_mxfp4(q[1]).v[1], unpack_mxfp4(q[2]).v[1],
  836. unpack_mxfp4(q[3]).v[1], unpack_mxfp4(q[60]).v[1], unpack_mxfp4(q[61]).v[1], unpack_mxfp4(q[62]).v[1],
  837. unpack_mxfp4(q[63]).v[1], unpack_mxfp4(q[124]).v[1], unpack_mxfp4(q[125]).v[1],
  838. unpack_mxfp4(q[126]).v[1], unpack_mxfp4(q[127]).v[1], GGML_E8M0_TO_FP32_HALF(e[4]),
  839. GGML_E8M0_TO_FP32_HALF(e[5]), GGML_E8M0_TO_FP32_HALF(e[6]), GGML_E8M0_TO_FP32_HALF(e[7]));
  840. }
  841. static void unpack_mxfp4_quants(uint8_t * qs, const block_mxfp4 * x, unsigned int bi) {
  842. static const int qk = QK_MXFP4;
  843. for (unsigned int i = 0; i < qk / 2; ++i) {
  844. const uint8_t x0 = (x->qs[i] & 0x0F);
  845. const uint8_t x1 = (x->qs[i] >> 4);
  846. qs[bi * qk + i + 0] = x0;
  847. qs[bi * qk + i + qk / 2] = x1;
  848. }
  849. }
  850. static void pack_mxfp4_quants(block_mxfp4 * x, const uint8_t * qs, unsigned int bi) {
  851. static const int qk = QK4_0;
  852. for (unsigned int i = 0; i < qk / 2; ++i) {
  853. const uint8_t x0 = qs[bi * qk + i + 0];
  854. const uint8_t x1 = qs[bi * qk + i + qk / 2];
  855. x->qs[i] = x0 | (x1 << 4);
  856. }
  857. }
  858. static void repack_row_mxfp4x4x2(uint8_t * y, const block_mxfp4 * x, int64_t k) {
  859. static const int qk = QK_MXFP4x4x2;
  860. const int nb = (k + qk - 1) / qk; // number of blocks (padded)
  861. const int eblk_size = 8 * 1; // 8x E8M0
  862. const int qblk_size = qk / 2; // int4
  863. const int qrow_size = k / 2; // int4 (not padded to blocks)
  864. uint8_t * y_q = y + 0; // quants first
  865. uint8_t * y_e = y + qrow_size; // then scales
  866. if (opt_verbose > 2) {
  867. for (int i = 0; i < nb; i++) {
  868. dump_block_mxfp4(&x[i * 8 + 0], 0);
  869. dump_block_mxfp4(&x[i * 8 + 1], 1);
  870. dump_block_mxfp4(&x[i * 8 + 2], 2);
  871. dump_block_mxfp4(&x[i * 8 + 3], 3);
  872. dump_block_mxfp4(&x[i * 8 + 4], 4);
  873. dump_block_mxfp4(&x[i * 8 + 5], 5);
  874. dump_block_mxfp4(&x[i * 8 + 6], 6);
  875. dump_block_mxfp4(&x[i * 8 + 7], 7);
  876. }
  877. }
  878. // Repack the quants
  879. for (int i = 0; i < nb; i++) {
  880. uint8_t qs[QK_MXFP4x4x2]; // unpacked quants
  881. unpack_mxfp4_quants(qs, &x[i * 8 + 0], 0);
  882. unpack_mxfp4_quants(qs, &x[i * 8 + 1], 1);
  883. unpack_mxfp4_quants(qs, &x[i * 8 + 2], 2);
  884. unpack_mxfp4_quants(qs, &x[i * 8 + 3], 3);
  885. unpack_mxfp4_quants(qs, &x[i * 8 + 4], 4);
  886. unpack_mxfp4_quants(qs, &x[i * 8 + 5], 5);
  887. unpack_mxfp4_quants(qs, &x[i * 8 + 6], 6);
  888. unpack_mxfp4_quants(qs, &x[i * 8 + 7], 7);
  889. uint8_t * q = y_q + (i * qblk_size);
  890. for (int j = 0; j < qk / 2; j++) {
  891. q[j] = (qs[j + 128] << 4) | qs[j];
  892. }
  893. }
  894. // Repack the scales
  895. // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_MXFP4x4x2)
  896. // the last block is truncated and overriden by the scales.
  897. for (int i = 0; i < nb; i++) {
  898. // Repack the scales
  899. uint8_t * e = (uint8_t *) (y_e + i * eblk_size);
  900. e[0] = x[i * 8 + 0].e;
  901. e[1] = x[i * 8 + 1].e;
  902. e[2] = x[i * 8 + 2].e;
  903. e[3] = x[i * 8 + 3].e;
  904. e[4] = x[i * 8 + 4].e;
  905. e[5] = x[i * 8 + 5].e;
  906. e[6] = x[i * 8 + 6].e;
  907. e[7] = x[i * 8 + 7].e;
  908. }
  909. if (opt_verbose > 1) {
  910. for (int i = 0; i < nb; i++) {
  911. dump_packed_block_mxfp4x4x2(y, i, k);
  912. }
  913. }
  914. }
  915. static void unpack_row_mxfp4x4x2(block_mxfp4 * x, const uint8_t * y, int64_t k) {
  916. static const int qk = QK_MXFP4x4x2;
  917. const int nb = (k + qk - 1) / qk; // number of blocks (padded)
  918. const int eblk_size = 8 * 1; // 8x E8M0
  919. const int qblk_size = qk / 2; // int4
  920. const int qrow_size = k / 2; // int4 (not padded to blocks)
  921. const uint8_t * y_q = y + 0; // quants first
  922. const uint8_t * y_e = y + qrow_size; // then scales
  923. if (opt_verbose > 1) {
  924. for (int i = 0; i < nb; i++) {
  925. dump_packed_block_mxfp4x4x2(y, i, k);
  926. }
  927. }
  928. // Unpack the quants
  929. for (int i = 0; i < nb; i++) {
  930. uint8_t qs[QK_MXFP4x4x2]; // unpacked quants
  931. const uint8_t * q = y_q + (i * qblk_size);
  932. for (int j = 0; j < qk / 2; j++) {
  933. qs[j] = q[j] & 0xf;
  934. qs[j + 128] = q[j] >> 4;
  935. }
  936. pack_mxfp4_quants(&x[i * 8 + 0], qs, 0);
  937. pack_mxfp4_quants(&x[i * 8 + 1], qs, 1);
  938. pack_mxfp4_quants(&x[i * 8 + 2], qs, 2);
  939. pack_mxfp4_quants(&x[i * 8 + 3], qs, 3);
  940. pack_mxfp4_quants(&x[i * 8 + 4], qs, 4);
  941. pack_mxfp4_quants(&x[i * 8 + 5], qs, 5);
  942. pack_mxfp4_quants(&x[i * 8 + 6], qs, 6);
  943. pack_mxfp4_quants(&x[i * 8 + 7], qs, 7);
  944. }
  945. // Repack the scales
  946. // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_MXFP4_0x4x2)
  947. // the last block is truncated and overriden by the scales.
  948. for (int i = 0; i < nb; i++) {
  949. // Unpack the scales
  950. const uint8_t * e = (const uint8_t *) (y_e + i * eblk_size);
  951. x[i * 8 + 0].e = e[0];
  952. x[i * 8 + 1].e = e[1];
  953. x[i * 8 + 2].e = e[2];
  954. x[i * 8 + 3].e = e[3];
  955. x[i * 8 + 4].e = e[4];
  956. x[i * 8 + 5].e = e[5];
  957. x[i * 8 + 6].e = e[6];
  958. x[i * 8 + 7].e = e[7];
  959. }
  960. if (opt_verbose > 2) {
  961. for (int i = 0; i < nb; i++) {
  962. dump_block_mxfp4(&x[i * 8 + 0], 0);
  963. dump_block_mxfp4(&x[i * 8 + 1], 1);
  964. dump_block_mxfp4(&x[i * 8 + 2], 2);
  965. dump_block_mxfp4(&x[i * 8 + 3], 3);
  966. dump_block_mxfp4(&x[i * 8 + 4], 4);
  967. dump_block_mxfp4(&x[i * 8 + 5], 5);
  968. dump_block_mxfp4(&x[i * 8 + 6], 6);
  969. dump_block_mxfp4(&x[i * 8 + 7], 7);
  970. }
  971. }
  972. }
  973. static void init_row_mxfp4x4x2(block_mxfp4 * x, int64_t k) {
  974. static const int qk = QK_MXFP4x4x2;
  975. const int nb = (k + qk - 1) / qk; // number of blocks (padded)
  976. // Init the quants such that they unpack into zeros
  977. uint8_t qs[QK_MXFP4x4x2]; // unpacked quants
  978. memset(qs, 0, sizeof(qs));
  979. for (int i = 0; i < nb; i++) {
  980. pack_mxfp4_quants(&x[i * 8 + 0], qs, 0);
  981. pack_mxfp4_quants(&x[i * 8 + 1], qs, 1);
  982. pack_mxfp4_quants(&x[i * 8 + 2], qs, 2);
  983. pack_mxfp4_quants(&x[i * 8 + 3], qs, 3);
  984. pack_mxfp4_quants(&x[i * 8 + 4], qs, 4);
  985. pack_mxfp4_quants(&x[i * 8 + 5], qs, 5);
  986. pack_mxfp4_quants(&x[i * 8 + 6], qs, 6);
  987. pack_mxfp4_quants(&x[i * 8 + 7], qs, 7);
  988. }
  989. // Init the scales
990. // Note: Do not combine with the loop above. For tensor sizes not a multiple of 256 (QK_MXFP4x4x2)
991. // the last block is truncated and overridden by the scales.
  992. for (int i = 0; i < nb; i++) {
993. // Zero the scales
  994. x[i * 8 + 0].e = 0;
  995. x[i * 8 + 1].e = 0;
  996. x[i * 8 + 2].e = 0;
  997. x[i * 8 + 3].e = 0;
  998. x[i * 8 + 4].e = 0;
  999. x[i * 8 + 5].e = 0;
  1000. x[i * 8 + 6].e = 0;
  1001. x[i * 8 + 7].e = 0;
  1002. }
  1003. }
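// The host-side repack helpers below work one row at a time through two scratch buffers:
// buf_pd holds the row padded up to a whole number of QK_MXFP4x4x2 (256) element super-blocks,
// with the tail pre-initialized (init_row_mxfp4x4x2() or memset() below) so it unpacks to zeros,
// and buf_rp receives the converted row before the original row_size bytes are copied back.
// For example, a row of 320 elements is padded to 512; the trailing 192 elements are just padding.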
  1004. // repack mxfp4 data into mxfp4x4x2 tensor
  1005. static void repack_mxfp4_mxfp4x4x2(ggml_tensor * t, const void * data, size_t size) {
  1006. int64_t nrows = ggml_nrows(t);
  1007. size_t row_size = ggml_row_size(t->type, t->ne[0]);
  1008. size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_MXFP4x4x2)); // extra elements for the pad
  1009. size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)
  1010. void * buf_pd = ggml_aligned_malloc(row_size_pd);
  1011. GGML_ASSERT(buf_pd != NULL);
  1012. void * buf_rp = ggml_aligned_malloc(row_size_rp);
  1013. GGML_ASSERT(buf_rp != NULL);
  1014. HEX_VERBOSE("ggml-hex: repack-mxfp4-mxfp4x4x2 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data,
  1015. size, t->ne[0], nrows, row_size);
  1016. init_row_mxfp4x4x2((block_mxfp4 *) buf_pd, t->ne[0]); // init padded buffer to make sure the tail is all zeros
  1017. for (int64_t i = 0; i < nrows; i++) {
  1018. const uint8_t * src = (const uint8_t *) data + (i * row_size);
  1019. uint8_t * dst = (uint8_t *) t->data + (i * row_size);
  1020. memcpy(buf_pd, src, row_size);
  1021. repack_row_mxfp4x4x2((uint8_t *) buf_rp, (const block_mxfp4 *) buf_pd, t->ne[0]);
  1022. memcpy(dst, buf_rp, row_size);
  1023. }
  1024. ggml_aligned_free(buf_pd, row_size_pd);
  1025. ggml_aligned_free(buf_rp, row_size_rp);
  1026. }
  1027. // repack mxfp4x4x2 tensor into mxfp4 data
  1028. static void repack_mxfp4x4x2_mxfp4(void * data, const ggml_tensor * t, size_t size) {
  1029. int64_t nrows = ggml_nrows(t);
  1030. size_t row_size = ggml_row_size(t->type, t->ne[0]);
  1031. size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_MXFP4x4x2)); // extra elements for the pad
  1032. size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)
  1033. void * buf_pd = ggml_aligned_malloc(row_size_pd);
  1034. GGML_ASSERT(buf_pd != NULL);
  1035. void * buf_rp = ggml_aligned_malloc(row_size_rp);
  1036. GGML_ASSERT(buf_rp != NULL);
  1037. HEX_VERBOSE("ggml-hex: repack-mxfp4x4x2-mxfp4 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data,
  1038. size, t->ne[0], nrows, row_size);
  1039. memset(buf_pd, 0, row_size_pd); // clear-out padded buffer to make sure the tail is all zeros
  1040. for (int64_t i = 0; i < nrows; i++) {
  1041. const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
  1042. uint8_t * dst = (uint8_t *) data + (i * row_size);
  1043. memcpy(buf_pd, src, row_size);
  1044. unpack_row_mxfp4x4x2((block_mxfp4 *) buf_rp, (const uint8_t *) buf_pd, t->ne[0]);
  1045. memcpy(dst, buf_rp, row_size);
  1046. }
  1047. ggml_aligned_free(buf_pd, row_size_pd);
  1048. ggml_aligned_free(buf_rp, row_size_rp);
  1049. }
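// set_tensor / get_tensor below convert transparently between the canonical GGML layouts and the
// HTP-friendly x4x2 layouts: Q4_0, Q8_0 and MXFP4 weights are repacked on upload and unpacked on
// download (whole-tensor transfers only, hence the offset/size asserts); all other types are a
// plain memcpy into/out of the buffer.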
  1050. static void ggml_backend_hexagon_buffer_set_tensor(ggml_backend_buffer_t buffer,
  1051. ggml_tensor * tensor,
  1052. const void * data,
  1053. size_t offset,
  1054. size_t size) {
  1055. auto ctx = (ggml_backend_hexagon_buffer_context *) buffer->context;
  1056. auto sess = ctx->sess;
  1057. HEX_VERBOSE("ggml-hex: %s set-tensor %s : data %p offset %zu size %zu\n", sess->name.c_str(), tensor->name, data,
  1058. offset, size);
  1059. switch (tensor->type) {
  1060. case GGML_TYPE_Q4_0:
  1061. GGML_ASSERT(offset == 0);
  1062. GGML_ASSERT(size == ggml_nbytes(tensor));
  1063. repack_q4_0_q4x4x2(tensor, data, size);
  1064. break;
  1065. case GGML_TYPE_Q8_0:
  1066. GGML_ASSERT(offset == 0);
  1067. GGML_ASSERT(size == ggml_nbytes(tensor));
  1068. repack_q8_0_q8x4x2(tensor, data, size);
  1069. break;
  1070. case GGML_TYPE_MXFP4:
  1071. GGML_ASSERT(offset == 0);
  1072. GGML_ASSERT(size == ggml_nbytes(tensor));
  1073. repack_mxfp4_mxfp4x4x2(tensor, data, size);
  1074. break;
  1075. default:
  1076. memcpy((char *) tensor->data + offset, data, size);
  1077. break;
  1078. }
  1079. }
  1080. static void ggml_backend_hexagon_buffer_get_tensor(ggml_backend_buffer_t buffer,
  1081. const ggml_tensor * tensor,
  1082. void * data,
  1083. size_t offset,
  1084. size_t size) {
  1085. auto ctx = (ggml_backend_hexagon_buffer_context *) buffer->context;
  1086. auto sess = ctx->sess;
  1087. HEX_VERBOSE("ggml-hex: %s get-tensor %s : data %p offset %zu size %zu\n", sess->name.c_str(), tensor->name, data,
  1088. offset, size);
  1089. switch (tensor->type) {
  1090. case GGML_TYPE_Q4_0:
  1091. GGML_ASSERT(offset == 0);
  1092. GGML_ASSERT(size == ggml_nbytes(tensor));
  1093. repack_q4x4x2_q4_0(data, tensor, size);
  1094. break;
  1095. case GGML_TYPE_Q8_0:
  1096. GGML_ASSERT(offset == 0);
  1097. GGML_ASSERT(size == ggml_nbytes(tensor));
  1098. repack_q8x4x2_q8_0(data, tensor, size);
  1099. break;
  1100. case GGML_TYPE_MXFP4:
  1101. GGML_ASSERT(offset == 0);
  1102. GGML_ASSERT(size == ggml_nbytes(tensor));
  1103. repack_mxfp4x4x2_mxfp4(data, tensor, size);
  1104. break;
  1105. default:
  1106. memcpy(data, (const char *) tensor->data + offset, size);
  1107. break;
  1108. }
  1109. }
  1110. static bool ggml_backend_hexagon_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
  1111. const struct ggml_tensor * src,
  1112. struct ggml_tensor * dst) {
  1113. GGML_UNUSED(buffer);
  1114. GGML_UNUSED(src);
  1115. GGML_UNUSED(dst);
  1116. // we might optimize this later, for now take the slow path (ie get/set_tensor)
  1117. return false;
  1118. }
  1119. static void ggml_backend_hexagon_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
  1120. auto ctx = (ggml_backend_hexagon_buffer_context *) buffer->context;
  1121. auto sess = ctx->sess;
  1122. HEX_VERBOSE("ggml-hex: %s clear-buff base %p size %zu\n", sess->name.c_str(), (void *) ctx->base, ctx->size);
  1123. memset(ctx->base, value, ctx->size);
  1124. }
  1125. static ggml_backend_buffer_i ggml_backend_hexagon_buffer_interface = {
  1126. /* .free_buffer = */ ggml_backend_hexagon_buffer_free_buffer,
  1127. /* .get_base = */ ggml_backend_hexagon_buffer_get_base,
  1128. /* .init_tensor = */ ggml_backend_hexagon_buffer_init_tensor,
  1129. /* .memset_tensor = */ NULL,
  1130. /* .set_tensor = */ ggml_backend_hexagon_buffer_set_tensor,
  1131. /* .get_tensor = */ ggml_backend_hexagon_buffer_get_tensor,
  1132. /* .cpy_tensor = */ ggml_backend_hexagon_buffer_cpy_tensor,
  1133. /* .clear = */ ggml_backend_hexagon_buffer_clear,
  1134. /* .reset = */ NULL,
  1135. };
  1136. // ** backend buffer type
  1137. static const char * ggml_backend_hexagon_buffer_type_name(ggml_backend_buffer_type_t buffer_type) {
  1138. return static_cast<ggml_backend_hexagon_buffer_type_context *>(buffer_type->context)->name.c_str();
  1139. }
  1140. static ggml_backend_buffer_t ggml_backend_hexagon_buffer_type_alloc_buffer(
  1141. ggml_backend_buffer_type_t buffer_type, size_t size) {
  1142. auto sess = static_cast<ggml_backend_hexagon_buffer_type_context *>(buffer_type->context)->sess;
  1143. try {
  1144. ggml_backend_hexagon_buffer_context * ctx = new ggml_backend_hexagon_buffer_context(sess, size, false /*repack*/);
  1145. return ggml_backend_buffer_init(buffer_type, ggml_backend_hexagon_buffer_interface, ctx, size);
  1146. } catch (std::exception const &exc) {
  1147. GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer context: %s\n", sess->name.c_str(), exc.what());
  1148. return nullptr;
  1149. }
  1150. }
  1151. static ggml_backend_buffer_t ggml_backend_hexagon_repack_buffer_type_alloc_buffer(
  1152. ggml_backend_buffer_type_t buffer_type, size_t size) {
  1153. auto sess = static_cast<ggml_backend_hexagon_buffer_type_context *>(buffer_type->context)->sess;
  1154. try {
  1155. ggml_backend_hexagon_buffer_context * ctx = new ggml_backend_hexagon_buffer_context(sess, size, true /*repack*/);
  1156. return ggml_backend_buffer_init(buffer_type, ggml_backend_hexagon_buffer_interface, ctx, size);
  1157. } catch (std::exception const &exc) {
  1158. GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer context: %s\n", sess->name.c_str(), exc.what());
  1159. return nullptr;
  1160. }
  1161. }
  1162. static size_t ggml_backend_hexagon_buffer_type_get_alignment(ggml_backend_buffer_type_t buffer_type) {
  1163. return 128; // HVX alignment
  1164. GGML_UNUSED(buffer_type);
  1165. }
  1166. static size_t ggml_backend_hexagon_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * t) {
  1167. return ggml_nbytes(t);
  1168. }
  1169. static size_t ggml_backend_hexagon_buffer_type_get_max_size(ggml_backend_buffer_type_t buffer_type) {
  1170. return 1 * 1024 * 1024 * 1024; // 1GB per buffer
  1171. GGML_UNUSED(buffer_type);
  1172. }
  1173. static bool ggml_backend_hexagon_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
  1174. return opt_hostbuf;
  1175. GGML_UNUSED(buft);
  1176. }
  1177. static bool ggml_backend_hexagon_repack_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
  1178. return false;
  1179. GGML_UNUSED(buft);
  1180. }
  1181. static ggml_backend_buffer_type_i ggml_backend_hexagon_buffer_type_interface = {
  1182. /* .get_name = */ ggml_backend_hexagon_buffer_type_name,
  1183. /* .alloc_buffer = */ ggml_backend_hexagon_buffer_type_alloc_buffer,
  1184. /* .get_alignment = */ ggml_backend_hexagon_buffer_type_get_alignment,
  1185. /* .get_max_size = */ ggml_backend_hexagon_buffer_type_get_max_size,
  1186. /* .get_alloc_size = */ ggml_backend_hexagon_buffer_type_get_alloc_size,
  1187. /* .is_host = */ ggml_backend_hexagon_buffer_type_is_host,
  1188. };
  1189. static ggml_backend_buffer_type_i ggml_backend_hexagon_repack_buffer_type_interface = {
  1190. /* .get_name = */ ggml_backend_hexagon_buffer_type_name,
  1191. /* .alloc_buffer = */ ggml_backend_hexagon_repack_buffer_type_alloc_buffer,
  1192. /* .get_alignment = */ ggml_backend_hexagon_buffer_type_get_alignment,
  1193. /* .get_max_size = */ ggml_backend_hexagon_buffer_type_get_max_size,
  1194. /* .get_alloc_size = */ ggml_backend_hexagon_buffer_type_get_alloc_size,
  1195. /* .is_host = */ ggml_backend_hexagon_repack_buffer_type_is_host,
  1196. };
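// Session bring-up, in order: optionally reserve a dedicated FastRPC session on CDSP (dev_id != 0),
// resolve the URI of the libggml-htp skel for this session, enable the unsigned PD for the domain,
// open the handle, enable FastRPC QoS, create and export a DSP queue, optionally enable ETM tracing,
// and finally start the DSP-side service, which imports the queue and begins processing packets.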
  1197. void ggml_hexagon_session::allocate(int dev_id) noexcept(false) {
  1198. this->valid_session = false;
  1199. this->valid_handle = false;
  1200. this->valid_queue = false;
  1201. this->valid_iface = false;
  1202. this->domain_id = 3; // Default for CDSP, updated after the session is created
  1203. this->session_id = 0; // Default for CDSP, updated after the session is created
  1204. this->dev_id = dev_id;
  1205. this->name = std::string("HTP") + std::to_string(dev_id);
  1206. this->op_pending = 0;
  1207. this->prof_usecs = 0;
  1208. this->prof_cycles = 0;
  1209. this->prof_pkts = 0;
  1210. GGML_LOG_INFO("ggml-hex: allocating new session: %s\n", this->name.c_str());
  1211. domain * my_domain = get_domain(this->domain_id);
  1212. if (my_domain == NULL) {
  1213. GGML_LOG_ERROR("ggml-hex: unable to get domain struct for CDSP\n");
  1214. throw std::runtime_error("ggml-hex: failed to get CDSP domain (see log for details)");
  1215. }
  1216. // Create new session
  1217. if (dev_id != 0) {
  1218. struct remote_rpc_reserve_new_session n;
  1219. n.domain_name_len = strlen(CDSP_DOMAIN_NAME);
  1220. n.domain_name = const_cast<char *>(CDSP_DOMAIN_NAME);
  1221. n.session_name = const_cast<char *>(this->name.c_str());
  1222. n.session_name_len = this->name.size();
  1223. int err = remote_session_control(FASTRPC_RESERVE_NEW_SESSION, (void *) &n, sizeof(n));
  1224. if (err != AEE_SUCCESS) {
  1225. GGML_LOG_ERROR("ggml-hex: failed to reserve new session %d : error 0x%x\n", dev_id, err);
  1226. throw std::runtime_error("ggml-hex: remote_session_control(new-sess) failed (see log for details)");
  1227. }
  1228. // Save the IDs
  1229. this->session_id = n.session_id;
  1230. this->domain_id = n.effective_domain_id;
  1231. this->valid_session = true;
  1232. }
  1233. // Get session URI
  1234. char htp_uri[256];
  1235. sprintf(htp_uri, "file:///libggml-htp-v%u.so?htp_iface_skel_handle_invoke&_modver=1.0", opt_arch);
  1236. char session_uri[256];
  1237. {
  1238. struct remote_rpc_get_uri u;
  1239. u.session_id = this->session_id;
  1240. u.domain_name = const_cast<char *>(CDSP_DOMAIN_NAME);
  1241. u.domain_name_len = strlen(CDSP_DOMAIN_NAME);
  1242. u.module_uri = const_cast<char *>(htp_uri);
  1243. u.module_uri_len = strlen(htp_uri);
  1244. u.uri = session_uri;
  1245. u.uri_len = sizeof(session_uri);
  1246. int err = remote_session_control(FASTRPC_GET_URI, (void *) &u, sizeof(u));
  1247. if (err != AEE_SUCCESS) {
  1248. GGML_LOG_ERROR("ggml-hex: failed to get URI for session %d : error 0x%x\n", dev_id, err);
  1249. throw std::runtime_error("ggml-hex: remote_session_control(get-uri) failed (see log for details)");
  1250. }
  1251. }
  1252. // Enable Unsigned PD
  1253. {
  1254. struct remote_rpc_control_unsigned_module u;
  1255. u.domain = this->domain_id;
  1256. u.enable = 1;
  1257. int err = remote_session_control(DSPRPC_CONTROL_UNSIGNED_MODULE, (void *) &u, sizeof(u));
  1258. if (err != AEE_SUCCESS) {
  1259. GGML_LOG_ERROR("ggml-hex: failed to enable unsigned PD for session %d : error 0x%x\n", dev_id, err);
  1260. throw std::runtime_error("ggml-hex: remote_session_control(unsign) failed (see log for details)");
  1261. }
  1262. }
  1263. // Open session
  1264. int err = htp_iface_open(session_uri, &this->handle);
  1265. if (err != AEE_SUCCESS) {
  1266. GGML_LOG_ERROR("ggml-hex: failed to open session %d : error 0x%x\n", dev_id, err);
  1267. throw std::runtime_error("ggml-hex: failed to open session (see log for details)");
  1268. }
  1269. this->valid_handle = true;
  1270. GGML_LOG_INFO("ggml-hex: new session: %s : session-id %d domain-id %d uri %s handle 0x%lx\n", this->name.c_str(),
  1271. this->session_id, this->domain_id, session_uri, (unsigned long) this->handle);
  1272. // Enable FastRPC QoS mode
  1273. {
  1274. struct remote_rpc_control_latency l;
  1275. l.enable = 1;
  1276. int err = remote_handle64_control(this->handle, DSPRPC_CONTROL_LATENCY, (void *) &l, sizeof(l));
  1277. if (err != 0) {
  1278. GGML_LOG_WARN("ggml-hex: failed to enable fastrpc QOS mode: 0x%08x\n", (unsigned) err);
  1279. }
  1280. }
  1281. // Now let's setup the DSP queue
  1282. err = dspqueue_create(this->domain_id,
  1283. 0, // Flags
  1284. 128 * 1024, // Request queue size (in bytes)
  1285. 64 * 1024, // Response queue size (in bytes)
  1286. htp_packet_callback, htp_error_callback,
  1287. (void *) this, // Callback context
  1288. &queue);
  1289. if (err != 0) {
  1290. GGML_LOG_ERROR("ggml-hex: %s dspqueue_create failed: 0x%08x\n", this->name.c_str(), (unsigned) err);
  1291. throw std::runtime_error("ggml-hex: failed to create dspqueue (see log for details)");
  1292. }
  1293. this->valid_queue = true;
  1294. // Export queue for use on the DSP
  1295. err = dspqueue_export(queue, &this->queue_id);
  1296. if (err != 0) {
  1297. GGML_LOG_ERROR("ggml-hex: dspqueue_export failed: 0x%08x\n", (unsigned) err);
  1298. throw std::runtime_error("ggml-hex: dspqueue export failed (see log for details)");
  1299. }
  1300. if (opt_etm) {
  1301. err = htp_iface_enable_etm(this->handle);
  1302. if (err != 0) {
  1303. GGML_LOG_ERROR("ggml-hex: failed to enable ETM tracing: 0x%08x\n", (unsigned) err);
  1304. }
  1305. }
  1306. // Start the DSP-side service. We need to pass the queue ID to the
  1307. // DSP in a FastRPC call; the DSP side will import the queue and start
  1308. // listening for packets in a callback.
  1309. err = htp_iface_start(this->handle, dev_id, this->queue_id, opt_nhvx);
  1310. if (err != 0) {
  1311. GGML_LOG_ERROR("ggml-hex: failed to start session: 0x%08x\n", (unsigned) err);
  1312. throw std::runtime_error("ggml-hex: iface start failed (see log for details)");
  1313. }
  1314. this->valid_iface = true;
  1315. }
  1316. void ggml_hexagon_session::release() noexcept(true) {
  1317. GGML_LOG_INFO("ggml-hex: releasing session: %s\n", this->name.c_str());
  1318. int err;
  1319. // Stop the DSP-side service and close the queue
  1320. if (this->valid_iface) {
  1321. err = htp_iface_stop(this->handle);
  1322. if (err != 0) {
  1323. GGML_ABORT("ggml-hex: htp_iface_stop failed: 0x%08x\n", (unsigned) err);
  1324. }
  1325. }
1326. if (opt_etm && this->valid_handle) {
  1327. err = htp_iface_disable_etm(this->handle);
  1328. if (err != 0) {
1329. GGML_LOG_WARN("ggml-hex: failed to disable ETM tracing: 0x%08x\n", (unsigned) err);
  1330. }
  1331. }
  1332. if (this->valid_queue) {
  1333. err = dspqueue_close(queue);
  1334. if (err != 0) {
  1335. GGML_ABORT("ggml-hex: dspqueue_close failed: 0x%08x\n", (unsigned) err);
  1336. }
  1337. }
  1338. if (this->valid_handle) {
  1339. htp_iface_close(this->handle);
  1340. }
  1341. }
  1342. ggml_hexagon_session::ggml_hexagon_session(int dev_id) noexcept(false) {
  1343. buffer_type.context = nullptr;
  1344. repack_buffer_type.context = nullptr;
  1345. try {
  1346. allocate(dev_id);
  1347. buffer_type.iface = ggml_backend_hexagon_buffer_type_interface;
  1348. buffer_type.context = new ggml_backend_hexagon_buffer_type_context(this->name, this);
  1349. repack_buffer_type.iface = ggml_backend_hexagon_repack_buffer_type_interface;
  1350. repack_buffer_type.context = new ggml_backend_hexagon_buffer_type_context(this->name + "-REPACK", this);
  1351. } catch (std::exception const &exc) {
  1352. release();
  1353. throw;
  1354. }
  1355. }
  1356. ggml_hexagon_session::~ggml_hexagon_session() noexcept(true) {
  1357. release();
  1358. delete static_cast<ggml_backend_hexagon_buffer_type_context*>(buffer_type.context);
  1359. delete static_cast<ggml_backend_hexagon_buffer_type_context*>(repack_buffer_type.context);
  1360. }
  1361. // ** backend interface
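// Buffer identity checks: rather than tagging the buffer types, compare interface function
// pointers. Any buffer whose type uses the Hexagon get_alignment hook is a Hexagon buffer, and
// repack buffers are the subset whose alloc_buffer hook is the repack variant.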
  1362. static bool ggml_backend_buffer_is_hexagon(const struct ggml_backend_buffer * b) {
  1363. return b->buft->iface.get_alignment == ggml_backend_hexagon_buffer_type_get_alignment;
  1364. }
  1365. static inline bool ggml_backend_buffer_is_hexagon_repack(const struct ggml_backend_buffer * b) {
  1366. return b->buft->iface.alloc_buffer == ggml_backend_hexagon_repack_buffer_type_alloc_buffer;
  1367. }
  1368. static bool hex_supported_dims2(const struct ggml_tensor * x, const struct ggml_tensor * y) {
  1369. if (x->ne[0] != y->ne[0]) {
  1370. return false;
  1371. }
  1372. if (x->ne[1] != y->ne[1]) {
  1373. return false;
  1374. }
  1375. if (x->ne[2] != y->ne[2]) {
  1376. return false;
  1377. }
  1378. if (x->ne[3] != y->ne[3]) {
  1379. return false;
  1380. }
  1381. return true;
  1382. }
  1383. static bool hex_supported_src0_type(ggml_type t) {
  1384. return t == GGML_TYPE_F32;
  1385. }
  1386. static bool hex_supported_src1_type(ggml_type t) {
  1387. return t == GGML_TYPE_F32;
  1388. }
  1389. static bool hex_supported_src2_type(ggml_type t) {
  1390. return t == GGML_TYPE_F32;
  1391. }
  1392. static bool hex_supported_src1_type2(ggml_type t) {
  1393. return t == GGML_TYPE_F16;
  1394. }
  1395. static bool hex_supported_src1_type3(ggml_type t) {
  1396. return t == GGML_TYPE_I32;
  1397. }
  1398. static bool hex_supported_dst_type(ggml_type t) {
  1399. return t == GGML_TYPE_F32;
  1400. }
  1401. static bool hex_supported_dims(const struct ggml_tensor * x, const struct ggml_tensor * y) {
1402. // TODO: support broadcast for ne[2] and ne[3]
  1403. if (x->ne[0] != y->ne[0]) {
  1404. return false;
  1405. }
  1406. if (x->ne[2] != y->ne[2]) {
  1407. return false;
  1408. }
  1409. if (x->ne[3] != y->ne[3]) {
  1410. return false;
  1411. }
  1412. return true;
  1413. }
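// The ggml_hexagon_supported_* predicates below follow a common pattern: check operand types and
// shapes, require contiguous tensors (non-contiguous support is still a TODO), and require that
// every operand that already has a buffer is a Hexagon buffer bound to this session (quantized
// mul_mat weights must additionally live in the REPACK buffer type).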
  1414. static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * sess, const struct ggml_tensor * dst) {
  1415. const struct ggml_tensor * src0 = dst->src[0];
  1416. const struct ggml_tensor * src1 = dst->src[1];
  1417. if (src1->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) {
  1418. return false;
  1419. }
  1420. // TODO: add support for non-cont tensors
  1421. if (!ggml_is_contiguous(src1) || !ggml_is_contiguous(dst)) {
  1422. return false;
  1423. }
  1424. switch (src0->type) {
  1425. case GGML_TYPE_Q4_0:
  1426. case GGML_TYPE_Q8_0:
  1427. case GGML_TYPE_MXFP4:
  1428. if (src0->ne[0] % 32) {
  1429. return false;
  1430. }
  1431. if (src0->ne[1] > 16 * 1024) {
  1432. return false; // typically the lm-head which would be too large for VTCM
  1433. }
  1434. // if ((src0->ne[2] != src1->ne[2] || src0->ne[3] != src1->ne[3])) return false;
  1435. if ((src1->ne[2] != 1 || src1->ne[3] != 1)) {
  1436. return false;
  1437. }
  1438. // src0 (weights) must be repacked
  1439. if (src0->buffer && !ggml_backend_buffer_is_hexagon_repack(src0->buffer)) {
  1440. return false;
  1441. }
  1442. break;
  1443. case GGML_TYPE_F16:
  1444. if (!opt_experimental) {
  1445. return false;
  1446. }
  1447. break;
  1448. default:
  1449. return false;
  1450. }
  1451. // src0 & src1 & dst must be mapped to the same session
  1452. if (src0->buffer &&
  1453. (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
  1454. return false;
  1455. }
  1456. if (src1->buffer &&
  1457. (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
  1458. return false;
  1459. }
  1460. if (dst->buffer &&
  1461. (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
  1462. return false;
  1463. }
  1464. return true;
  1465. }
  1466. static bool ggml_hexagon_supported_mul_mat_id(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
  1467. const struct ggml_tensor * src0 = op->src[0];
  1468. const struct ggml_tensor * src1 = op->src[1];
  1469. const struct ggml_tensor * src2 = op->src[2];
  1470. const struct ggml_tensor * dst = op;
  1471. if (src1->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32 || src2->type != GGML_TYPE_I32) {
  1472. return false;
  1473. }
  1474. switch (src0->type) {
  1475. case GGML_TYPE_Q4_0:
  1476. case GGML_TYPE_Q8_0:
  1477. case GGML_TYPE_MXFP4:
  1478. if ((src0->ne[0] % 32)) {
  1479. return false;
  1480. }
  1481. // src0 (weights) must be repacked
  1482. if (src0->buffer && !ggml_backend_buffer_is_hexagon_repack(src0->buffer)) {
  1483. return false;
  1484. }
  1485. break;
  1486. case GGML_TYPE_F16:
  1487. if (!opt_experimental) {
  1488. return false;
  1489. }
  1490. break;
  1491. default:
  1492. return false;
  1493. }
  1494. // TODO: add support for non-cont tensors
  1495. if (!ggml_is_contiguous(src1) || !ggml_is_contiguous(dst)) {
  1496. return false;
  1497. }
  1498. // src0 (weights) must be repacked and mapped to the same session
1499. // src1 & src2 & dst must be mapped to the same session
  1500. if (src0->buffer &&
  1501. (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
  1502. return false;
  1503. }
  1504. if (src1->buffer &&
  1505. (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
  1506. return false;
  1507. }
  1508. if (src2->buffer &&
  1509. (!ggml_backend_buffer_is_hexagon(src2->buffer) || ggml_backend_hexagon_buffer_get_sess(src2->buffer) != sess)) {
  1510. return false;
  1511. }
  1512. if (dst->buffer &&
  1513. (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
  1514. return false;
  1515. }
  1516. return true;
  1517. }
  1518. static bool ggml_hexagon_supported_binary(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
  1519. const struct ggml_tensor * src0 = op->src[0];
  1520. const struct ggml_tensor * src1 = op->src[1];
  1521. const struct ggml_tensor * dst = op;
  1522. if (!hex_supported_src0_type(src0->type)) {
  1523. return false;
  1524. }
  1525. if (!hex_supported_src1_type(src1->type)) {
  1526. return false;
  1527. }
  1528. if (!hex_supported_dst_type(dst->type)) {
  1529. return false;
  1530. }
  1531. if (!hex_supported_dims2(src0, dst)) {
  1532. return false;
  1533. }
  1534. if (!ggml_can_repeat(src1, src0)) {
  1535. return false;
  1536. }
1538. // TODO: add support for non-contiguous tensors
  1538. if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1) || !ggml_is_contiguous(dst)) {
  1539. return false;
  1540. }
  1541. // src0, src1 & dst must be mapped to the same session
  1542. if (src0->buffer &&
  1543. (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
  1544. return false;
  1545. }
  1546. if (src1->buffer &&
  1547. (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
  1548. return false;
  1549. }
  1550. if (dst->buffer &&
  1551. (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
  1552. return false;
  1553. }
  1554. return true;
  1555. }
  1556. static bool ggml_hexagon_supported_add_id(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
  1557. const struct ggml_tensor * src0 = op->src[0];
  1558. const struct ggml_tensor * src1 = op->src[1];
  1559. const struct ggml_tensor * src2 = op->src[2];
  1560. const struct ggml_tensor * dst = op;
  1561. if (!hex_supported_src0_type(src0->type)) {
  1562. return false;
  1563. }
  1564. if (!hex_supported_src1_type(src1->type)) {
  1565. return false;
  1566. }
  1567. if (!hex_supported_dst_type(dst->type)) {
  1568. return false;
  1569. }
  1570. if (!hex_supported_dims2(src0, dst)) {
  1571. return false;
  1572. }
1573. // REVISIT: add support for non-contiguous tensors
  1574. if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1) || !ggml_is_contiguous(dst)) {
  1575. return false;
  1576. }
  1577. // src0, src1 & dst must be mapped to the same session
  1578. if (src0->buffer &&
  1579. (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
  1580. return false;
  1581. }
  1582. if (src1->buffer &&
  1583. (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
  1584. return false;
  1585. }
  1586. if (src2->buffer &&
  1587. (!ggml_backend_buffer_is_hexagon(src2->buffer) || ggml_backend_hexagon_buffer_get_sess(src2->buffer) != sess)) {
  1588. return false;
  1589. }
  1590. if (dst->buffer &&
  1591. (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
  1592. return false;
  1593. }
  1594. return true;
  1595. }
  1596. static bool ggml_hexagon_supported_unary(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
  1597. const struct ggml_tensor * src0 = op->src[0];
  1598. const struct ggml_tensor * dst = op;
  1599. if (!hex_supported_src0_type(src0->type)) {
  1600. return false;
  1601. }
  1602. if (!hex_supported_dst_type(dst->type)) {
  1603. return false;
  1604. }
  1605. if (!hex_supported_dims2(src0, dst)) {
  1606. return false;
  1607. }
1608. // TODO: add support for non-contiguous tensors
  1609. if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(dst)) {
  1610. return false;
  1611. }
  1612. // src0 & dst must be mapped to the same session
  1613. if (src0->buffer &&
  1614. (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
  1615. return false;
  1616. }
  1617. if (dst->buffer &&
  1618. (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
  1619. return false;
  1620. }
  1621. return true;
  1622. }
  1623. static bool ggml_hexagon_supported_activations(const struct ggml_hexagon_session * sess,
  1624. const struct ggml_tensor * op) {
  1625. const struct ggml_tensor * src0 = op->src[0];
  1626. const struct ggml_tensor * src1 = op->src[1];
  1627. const struct ggml_tensor * dst = op;
  1628. if (!hex_supported_src0_type(src0->type)) {
  1629. return false;
  1630. }
  1631. if (!hex_supported_dst_type(dst->type)) {
  1632. return false;
  1633. }
  1634. if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(dst)) {
  1635. return false;
  1636. }
  1637. if (src1) {
  1638. if (!hex_supported_src1_type(src1->type)) {
  1639. return false;
  1640. }
  1641. if (!hex_supported_dims2(src0, src1)) {
  1642. return false;
  1643. }
  1644. if (!ggml_is_contiguous(src1)) {
  1645. return false;
  1646. }
  1647. }
  1648. // src0, src1 & dst must be mapped to the same session
  1649. if (src0->buffer &&
  1650. (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
  1651. return false;
  1652. }
  1653. if (src1 && src1->buffer &&
  1654. (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
  1655. return false;
  1656. }
  1657. if (dst->buffer &&
  1658. (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
  1659. return false;
  1660. }
  1661. return true;
  1662. }
  1663. static bool ggml_hexagon_supported_softmax(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
  1664. const struct ggml_tensor * src0 = op->src[0];
  1665. const struct ggml_tensor * src1 = op->src[1];
  1666. const struct ggml_tensor * src2 = op->src[2];
  1667. const struct ggml_tensor * dst = op;
  1668. if (src2) {
  1669. return false; // FIXME: add support for sinks
  1670. }
  1671. if (!hex_supported_src0_type(src0->type)) {
  1672. return false;
  1673. }
  1674. if (!hex_supported_dst_type(dst->type)) {
  1675. return false;
  1676. }
  1677. if (src1) {
  1678. if (!hex_supported_src1_type(src1->type) && !hex_supported_src1_type2(src1->type)) {
  1679. return false;
  1680. }
  1681. if (src0->ne[0] != src1->ne[0]) {
  1682. return false;
  1683. }
  1684. if (src1->ne[1] < src0->ne[1]) {
  1685. return false;
  1686. }
  1687. if (src0->ne[2] % src1->ne[2] != 0) {
  1688. return false;
  1689. }
  1690. if (src0->ne[3] % src1->ne[3] != 0) {
  1691. return false;
  1692. }
  1693. }
  1694. if (src1) {
  1695. if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1) || !ggml_is_contiguous(dst)) {
  1696. return false;
  1697. }
  1698. } else {
  1699. if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(dst)) {
  1700. return false;
  1701. }
  1702. }
  1703. // src0, src1 & dst must be mapped to the same session
  1704. if (src0->buffer &&
  1705. (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
  1706. return false;
  1707. }
  1708. if (src1 && src1->buffer &&
  1709. (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
  1710. return false;
  1711. }
  1712. if (dst->buffer &&
  1713. (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
  1714. return false;
  1715. }
  1716. return true;
  1717. }
  1718. static bool ggml_hexagon_supported_rope(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
  1719. const int32_t * op_params = &op->op_params[0];
  1720. int mode = op_params[2];
  1721. if ((mode & GGML_ROPE_TYPE_NEOX) || (mode & GGML_ROPE_TYPE_MROPE) || (mode & GGML_ROPE_TYPE_VISION)) {
  1722. return false;
  1723. }
  1724. if (mode & 1) {
  1725. return false;
  1726. }
  1727. const struct ggml_tensor * src0 = op->src[0];
  1728. const struct ggml_tensor * src1 = op->src[1];
  1729. const struct ggml_tensor * src2 = op->src[2];
  1730. const struct ggml_tensor * dst = op;
  1731. if (!hex_supported_src0_type(src0->type)) {
  1732. return false; // FIXME: add support for GGML_TYPE_F16 for src0
  1733. }
  1734. if (!hex_supported_dst_type(dst->type)) {
  1735. return false;
  1736. }
  1737. if (!hex_supported_src1_type3(src1->type)) {
  1738. return false;
  1739. }
  1740. if (src2) {
  1741. if (!hex_supported_src2_type(src2->type)) {
  1742. return false;
  1743. }
  1744. int n_dims = op_params[1];
  1745. if (src2->ne[0] < (n_dims / 2)) {
  1746. return false;
  1747. }
  1748. }
  1749. if (src2) {
  1750. if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1) || !ggml_is_contiguous(src2) ||
  1751. !ggml_is_contiguous(dst)) {
  1752. return false;
  1753. }
  1754. } else {
  1755. if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1) || !ggml_is_contiguous(dst)) {
  1756. return false;
  1757. }
  1758. }
  1759. // src0, src1, src2 & dst must be mapped to the same session
  1760. if (src0->buffer &&
  1761. (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
  1762. return false;
  1763. }
  1764. if (src1->buffer &&
  1765. (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
  1766. return false;
  1767. }
  1768. if (src2 && src2->buffer &&
  1769. (!ggml_backend_buffer_is_hexagon(src2->buffer) || ggml_backend_hexagon_buffer_get_sess(src2->buffer) != sess)) {
  1770. return false;
  1771. }
  1772. if (dst->buffer &&
  1773. (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
  1774. return false;
  1775. }
  1776. return true;
  1777. }
  1778. // Init hexagon tensor from GGML tensor and Hexagon buffer
  1779. static void init_htp_tensor(htp_tensor * h, const ggml_tensor * t) {
  1780. h->data = 0; // updated by the receiver
  1781. h->type = t->type;
  1782. h->ne[0] = t->ne[0];
  1783. h->ne[1] = t->ne[1];
  1784. h->ne[2] = t->ne[2];
  1785. h->ne[3] = t->ne[3];
  1786. h->nb[0] = t->nb[0];
  1787. h->nb[1] = t->nb[1];
  1788. h->nb[2] = t->nb[2];
  1789. h->nb[3] = t->nb[3];
  1790. }
  1791. static void hex_dump_dspbuf(const struct ggml_tensor * t, const dspqueue_buffer * d) {
  1792. auto buf = static_cast<ggml_backend_hexagon_buffer_context *>(t->buffer->context);
  1793. auto sess = buf->sess;
  1794. HEX_VERBOSE("ggml-hex: %s dspqbuf : %s base-addr %p base-size %zu data %p offset %u size %u\n", sess->name.c_str(),
  1795. t->name, (void *) buf->base, buf->size, (void *) d->ptr, (unsigned int) d->offset,
  1796. (unsigned int) d->size);
  1797. }
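// The op dispatchers below share a common flow: fill an htp_general_req, describe each operand as
// a dspqueue_buffer with the appropriate cache-maintenance flags (flush on the CPU side for inputs
// and invalidate on the DSP side; outputs are flushed here and handled in the response message),
// bump sess->op_pending, and enqueue the request with dspqueue_write(). The response callback
// clears op_pending; with opt_opsync set the call spins until it does, making the op synchronous.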
  1798. static void ggml_hexagon_mul_mat(const struct ggml_tensor * op, uint32_t flags) {
  1799. const struct ggml_tensor * src0 = op->src[0];
  1800. const struct ggml_tensor * src1 = op->src[1];
  1801. const struct ggml_tensor * dst = op;
  1802. auto src0_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src0->buffer->context);
  1803. auto src1_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src1->buffer->context);
  1804. auto dst_buf = static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context);
  1805. uint64_t t1, t2;
  1806. t1 = ggml_time_us();
  1807. // Construct HTP message
  1808. htp_general_req req;
  1809. req.op = HTP_OP_MUL_MAT;
  1810. req.flags = flags;
  1811. init_htp_tensor(&req.src0, src0);
  1812. init_htp_tensor(&req.src1, src1);
  1813. init_htp_tensor(&req.dst, dst);
  1814. // Use opmask to override flags
  1815. if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) {
  1816. req.flags |= HTP_OPFLAGS_SKIP_QUANTIZE;
  1817. }
  1818. if (!(opt_opmask & HTP_OPMASK_COMPUTE)) {
  1819. req.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
  1820. }
  1821. dspqueue_buffer bufs[3];
  1822. memset(bufs, 0, sizeof(bufs));
  1823. // First buffer Weights.
  1824. // The content is static, there is no need to do any cache management
  1825. bufs[0].fd = src0_buf->fd;
  1826. bufs[0].ptr = src0->data;
  1827. bufs[0].offset = (uint8_t *) src0->data - src0_buf->base;
  1828. bufs[0].size = ggml_nbytes(src0);
  1829. bufs[0].flags = DSPQUEUE_BUFFER_FLAG_REF;
  1830. // Second buffer Input Activations. This is a buffer that the CPU
  1831. // writes and the DSP reads, so we'll need to flush CPU caches and
  1832. // invalidate DSP ones. On platforms with I/O coherency support the
  1833. // framework will automatically skip cache operations where possible.
  1834. bufs[1].fd = src1_buf->fd;
  1835. bufs[1].ptr = src1->data;
  1836. bufs[1].offset = (uint8_t *) src1->data - src1_buf->base;
  1837. bufs[1].size = ggml_nbytes(src1);
  1838. bufs[1].flags = (DSPQUEUE_BUFFER_FLAG_REF | // Take a reference
  1839. DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
  1840. DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
  1841. // Third buffer Output Activations. We'll handle DSP
  1842. // cache maintenance in the response message but need to flush
  1843. // CPU caches to ensure any previously written dirty lines are
  1844. // written out before writes from the DSP start.
  1845. bufs[2].fd = dst_buf->fd;
  1846. bufs[2].ptr = dst->data;
  1847. bufs[2].offset = (uint8_t *) dst->data - dst_buf->base;
  1848. bufs[2].size = ggml_nbytes(dst);
  1849. bufs[2].flags = (DSPQUEUE_BUFFER_FLAG_REF | DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
  1850. // Primary DSP session from the src0 (normally weight) tensor
  1851. auto sess = src0_buf->sess;
  1852. if (opt_verbose) {
  1853. char dims[64 * GGML_MAX_SRC];
  1854. char strides[64 * GGML_MAX_SRC];
  1855. char types[16 * GGML_MAX_SRC];
  1856. char buffs[64 * GGML_MAX_SRC];
  1857. char names[64 * GGML_MAX_SRC];
  1858. hex_format_op_dims(dims, op);
  1859. hex_format_op_strides(strides, op);
  1860. hex_format_op_types(types, op);
  1861. hex_format_op_buffs(buffs, op);
  1862. hex_format_op_names(names, op);
  1863. HEX_VERBOSE("ggml-hex: %s %s: %s : %s : %s : %s : %s: flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op),
  1864. names, dims, types, strides, buffs, req.flags);
  1865. if (opt_verbose > 1) {
  1866. hex_dump_dspbuf(src0, &bufs[0]);
  1867. hex_dump_dspbuf(src1, &bufs[1]);
  1868. hex_dump_dspbuf(dst, &bufs[2]);
  1869. }
  1870. }
  1871. if ((opt_opmask & HTP_OPMASK_QUEUE)) {
1872. // Bump pending flag (cleared in the callback once we get the response)
  1873. sess->op_pending++; // atomic inc
  1874. int err = dspqueue_write(sess->queue,
  1875. 0, // flags - the framework will autoset this
  1876. 3, // number of buffers
  1877. bufs, // buffer references
  1878. sizeof(req),
  1879. (const uint8_t *) &req, // Message
  1880. 1000000 // Timeout
  1881. );
  1882. if (err != 0) {
  1883. GGML_ABORT("ggml-hex: %s dspqueue_write failed: 0x%08x\n", sess->name.c_str(), (unsigned) err);
  1884. }
  1885. }
  1886. if (opt_opsync) {
  1887. while (sess->op_pending) {
  1888. ;
  1889. }
  1890. }
  1891. t2 = ggml_time_us();
  1892. HEX_PROFILE(
  1893. "ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u (%f) "
  1894. "call-usec %llu\n",
  1895. sess->name.c_str(), ggml_op_name(op->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
  1896. (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1],
  1897. (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
  1898. (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
  1899. (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
  1900. }
  1901. static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flags) {
  1902. const struct ggml_tensor * src0 = op->src[0];
  1903. const struct ggml_tensor * src1 = op->src[1];
  1904. const struct ggml_tensor * src2 = op->src[2];
  1905. const struct ggml_tensor * dst = op;
  1906. auto src0_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src0->buffer->context);
  1907. auto src1_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src1->buffer->context);
  1908. auto src2_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src2->buffer->context);
  1909. auto dst_buf = static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context);
  1910. uint64_t t1, t2;
  1911. t1 = ggml_time_us();
  1912. // Construct HTP message
  1913. htp_general_req req;
  1914. req.op = HTP_OP_MUL_MAT_ID;
  1915. req.flags = flags;
  1916. init_htp_tensor(&req.src0, src0);
  1917. init_htp_tensor(&req.src1, src1);
  1918. init_htp_tensor(&req.src2, src2);
  1919. init_htp_tensor(&req.dst, dst);
  1920. // Use opmask to override flags
  1921. if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) {
  1922. req.flags |= HTP_OPFLAGS_SKIP_QUANTIZE;
  1923. }
  1924. if (!(opt_opmask & HTP_OPMASK_COMPUTE)) {
  1925. req.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
  1926. }
  1927. dspqueue_buffer bufs[4];
  1928. memset(bufs, 0, sizeof(bufs));
  1929. // First buffer Weights.
  1930. // The content is static, there is no need to do any cache management
  1931. bufs[0].fd = src0_buf->fd;
  1932. bufs[0].ptr = src0->data;
  1933. bufs[0].offset = (uint8_t *) src0->data - src0_buf->base;
  1934. bufs[0].size = ggml_nbytes(src0);
  1935. bufs[0].flags = DSPQUEUE_BUFFER_FLAG_REF;
  1936. // Second buffer Input Activations. This is a buffer that the CPU
  1937. // writes and the DSP reads, so we'll need to flush CPU caches and
  1938. // invalidate DSP ones. On platforms with I/O coherency support the
  1939. // framework will automatically skip cache operations where possible.
  1940. bufs[1].fd = src1_buf->fd;
  1941. bufs[1].ptr = src1->data;
  1942. bufs[1].offset = (uint8_t *) src1->data - src1_buf->base;
  1943. bufs[1].size = ggml_nbytes(src1);
  1944. bufs[1].flags = (DSPQUEUE_BUFFER_FLAG_REF | // Take a reference
  1945. DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
  1946. DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
  1947. // Third buffer expert IDs. This is a buffer that the CPU
  1948. // writes and the DSP reads, so we'll need to flush CPU caches and
  1949. // invalidate DSP ones. On platforms with I/O coherency support the
  1950. // framework will automatically skip cache operations where possible.
  1951. bufs[2].fd = src2_buf->fd;
  1952. bufs[2].ptr = src2->data;
  1953. bufs[2].offset = (uint8_t *) src2->data - src2_buf->base;
  1954. bufs[2].size = ggml_nbytes(src2);
  1955. bufs[2].flags = (DSPQUEUE_BUFFER_FLAG_REF | // Take a reference
  1956. DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
  1957. DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
1958. // Fourth buffer Output Activations. We'll handle DSP
  1959. // cache maintenance in the response message but need to flush
  1960. // CPU caches to ensure any previously written dirty lines are
  1961. // written out before writes from the DSP start.
  1962. bufs[3].fd = dst_buf->fd;
  1963. bufs[3].ptr = dst->data;
  1964. bufs[3].offset = (uint8_t *) dst->data - dst_buf->base;
  1965. bufs[3].size = ggml_nbytes(dst);
  1966. bufs[3].flags = (DSPQUEUE_BUFFER_FLAG_REF | DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
  1967. // Primary DSP session from the src0 (normally weight) tensor
  1968. auto sess = src0_buf->sess;
  1969. if (opt_verbose) {
  1970. char dims[64 * GGML_MAX_SRC];
  1971. char strides[64 * GGML_MAX_SRC];
  1972. char types[16 * GGML_MAX_SRC];
  1973. char buffs[64 * GGML_MAX_SRC];
  1974. char names[64 * GGML_MAX_SRC];
1975. hex_format_op_dims(dims, op);
hex_format_op_strides(strides, op);
  1976. hex_format_op_types(types, op);
  1977. hex_format_op_buffs(buffs, op);
  1978. hex_format_op_names(names, op);
  1979. HEX_VERBOSE("ggml-hex: %s %s: %s : %s : %s : %s : %s: flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op),
  1980. names, dims, types, strides, buffs, req.flags);
  1981. if (opt_verbose > 1) {
  1982. hex_dump_dspbuf(src0, &bufs[0]);
  1983. hex_dump_dspbuf(src1, &bufs[1]);
  1984. hex_dump_dspbuf(src2, &bufs[2]);
  1985. hex_dump_dspbuf(dst, &bufs[3]);
  1986. }
  1987. }
  1988. if ((opt_opmask & HTP_OPMASK_QUEUE)) {
1989. // Bump pending flag (cleared in the callback once we get the response)
  1990. sess->op_pending++; // atomic inc
  1991. int err = dspqueue_write(sess->queue,
  1992. 0, // flags - the framework will autoset this
  1993. 4, // number of buffers
  1994. bufs, // buffer references
  1995. sizeof(req),
  1996. (const uint8_t *) &req, // Message
  1997. 1000000 // Timeout
  1998. );
  1999. if (err != 0) {
  2000. GGML_ABORT("ggml-hex: %s dspqueue_write failed: 0x%08x\n", sess->name.c_str(), (unsigned) err);
  2001. }
  2002. }
  2003. if (opt_opsync) {
  2004. while (sess->op_pending) {
  2005. ;
  2006. }
  2007. }
  2008. t2 = ggml_time_us();
  2009. HEX_PROFILE(
  2010. "ggml-hex: %s matmul-id %s %u:%u:%u:%u x %s %u:%u:%u:%u (%s %u:%u:%u:%u) -> %s %u:%u:%u:%u : op-usec %u "
  2011. "op-cycles %u op-pkts %u (%f) call-usec %llu\n",
  2012. sess->name.c_str(), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], (uint32_t) src0->ne[2],
  2013. (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], (uint32_t) src1->ne[2],
  2014. (uint32_t) src1->ne[3], src2->name, (uint32_t) src2->ne[0], (uint32_t) src2->ne[1], (uint32_t) src2->ne[2],
  2015. (uint32_t) src2->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],
  2016. (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
  2017. (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
  2018. }
  2019. static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) {
  2020. const struct ggml_tensor * node = op;
  2021. const struct ggml_tensor * src0 = node->src[0];
  2022. const struct ggml_tensor * src1 = node->src[1];
  2023. const struct ggml_tensor * dst = node;
  2024. auto src0_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src0->buffer->context);
  2025. auto src1_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src1->buffer->context);
  2026. auto dst_buf = static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context);
  2027. uint64_t t1 = 0;
  2028. uint64_t t2 = 0;
  2029. t1 = ggml_time_us();
  2030. // Construct HTP message
  2031. htp_general_req req;
  2032. req.flags = flags;
  2033. // Use opmask to override flags
  2034. if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) {
  2035. req.flags |= HTP_OPFLAGS_SKIP_QUANTIZE;
  2036. }
  2037. if (!(opt_opmask & HTP_OPMASK_COMPUTE)) {
  2038. req.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
  2039. }
  2040. switch (node->op) {
  2041. case GGML_OP_MUL:
  2042. req.op = HTP_OP_MUL;
  2043. break;
  2044. case GGML_OP_ADD:
  2045. req.op = HTP_OP_ADD;
  2046. break;
  2047. case GGML_OP_SUB:
  2048. req.op = HTP_OP_SUB;
  2049. break;
  2050. default:
  2051. GGML_ABORT("ggml-hex: binary : unsupported op:%d\n", node->op);
  2052. }
  2053. init_htp_tensor(&req.src0, src0);
  2054. init_htp_tensor(&req.src1, src1);
  2055. init_htp_tensor(&req.dst, dst);
  2056. dspqueue_buffer bufs[3];
  2057. memset(bufs, 0, sizeof(bufs));
  2058. // First buffer = First Operand of Binary op
  2059. // This is a buffer that the CPU writes and the DSP reads, so we'll
  2060. // need to flush CPU caches and invalidate DSP ones. On platforms
  2061. // with I/O coherency support the framework will automatically skip
  2062. // cache operations where possible.
  2063. bufs[0].fd = src0_buf->fd;
  2064. bufs[0].ptr = src0->data;
  2065. bufs[0].offset = (uint8_t *) src0->data - src0_buf->base;
  2066. bufs[0].size = ggml_nbytes(src0);
  2067. bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_REF | // Take a reference
  2068. DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
2069. DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
  2070. // Second buffer = Second Operand of Binary op
  2071. // This is a buffer that the CPU writes and the DSP reads, so we'll
  2072. // need to flush CPU caches and invalidate DSP ones. On platforms
  2073. // with I/O coherency support the framework will automatically skip
  2074. // cache operations where possible.
  2075. bufs[1].fd = src1_buf->fd;
  2076. bufs[1].ptr = src1->data;
  2077. bufs[1].offset = (uint8_t *) src1->data - src1_buf->base;
  2078. bufs[1].size = ggml_nbytes(src1);
  2079. bufs[1].flags = (DSPQUEUE_BUFFER_FLAG_REF | // Take a reference
  2080. DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
  2081. DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
  2082. // Third buffer = Output Activations. We'll handle DSP
  2083. // cache maintenance in the response message but need to flush
  2084. // CPU caches to ensure any previously written dirty lines are
  2085. // written out before writes from the DSP start.
  2086. bufs[2].fd = dst_buf->fd;
  2087. bufs[2].ptr = dst->data;
  2088. bufs[2].offset = (uint8_t *) dst->data - dst_buf->base;
  2089. bufs[2].size = ggml_nbytes(dst);
  2090. bufs[2].flags = (DSPQUEUE_BUFFER_FLAG_REF | DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
  2091. // Primary DSP session from the src0 tensor
  2092. ggml_hexagon_session * sess = src0_buf->sess;
  2093. if (opt_verbose) {
  2094. char dims[64 * GGML_MAX_SRC];
  2095. char strides[16 * GGML_MAX_SRC];
  2096. char types[16 * GGML_MAX_SRC];
  2097. char buffs[64 * GGML_MAX_SRC];
  2098. char names[64 * GGML_MAX_SRC];
  2099. hex_format_op_dims(dims, op);
  2100. hex_format_op_strides(strides, op);
  2101. hex_format_op_types(types, op);
  2102. hex_format_op_buffs(buffs, op);
  2103. hex_format_op_names(names, op);
  2104. HEX_VERBOSE("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(),
  2105. ggml_op_name(node->op), names, dims, types, strides, buffs, req.flags);
  2106. if (opt_verbose > 1) {
  2107. hex_dump_dspbuf(src0, &bufs[0]);
  2108. hex_dump_dspbuf(src1, &bufs[1]);
  2109. hex_dump_dspbuf(dst, &bufs[2]);
  2110. }
  2111. }
  2112. if ((opt_opmask & HTP_OPMASK_QUEUE)) {
2113. // Bump pending flag (cleared in the callback once we get the response)
  2114. sess->op_pending++; // atomic inc
  2115. int err = dspqueue_write(sess->queue,
  2116. 0, // flags - the framework will autoset this
  2117. 3, // number of buffers
  2118. bufs, // buffer references
  2119. sizeof(req),
  2120. (const uint8_t *) &req, // Message
  2121. 1000000); // Timeout
  2122. if (0 != err) {
  2123. GGML_ABORT("ggml-hex: %s dspqueue_write failed: 0x%08x\n", sess->name.c_str(), (unsigned) err);
  2124. }
  2125. }
  2126. if (opt_opsync) {
  2127. while (sess->op_pending) {
  2128. ;
  2129. }
  2130. }
  2131. t2 = ggml_time_us();
  2132. HEX_PROFILE(
  2133. "ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u (%f) "
  2134. "call-usec %llu\n",
  2135. sess->name.c_str(), ggml_op_name(node->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
  2136. (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1],
  2137. (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
  2138. (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
  2139. (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
  2140. }
  2141. static void ggml_hexagon_add_id(const struct ggml_tensor * op, uint32_t flags) {
  2142. const struct ggml_tensor * node = op;
  2143. const struct ggml_tensor * src0 = node->src[0];
  2144. const struct ggml_tensor * src1 = node->src[1];
  2145. const struct ggml_tensor * src2 = node->src[2];
  2146. const struct ggml_tensor * dst = node;
  2147. auto src0_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src0->buffer->context);
  2148. auto src1_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src1->buffer->context);
  2149. auto src2_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src2->buffer->context);
  2150. auto dst_buf = static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context);
  2151. uint64_t t1 = 0;
  2152. uint64_t t2 = 0;
  2153. t1 = ggml_time_us();
  2154. // Construct HTP message
  2155. htp_general_req req;
  2156. req.flags = flags;
  2157. // Use opmask to override flags
  2158. if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) {
  2159. req.flags |= HTP_OPFLAGS_SKIP_QUANTIZE;
  2160. }
  2161. if (!(opt_opmask & HTP_OPMASK_COMPUTE)) {
  2162. req.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
  2163. }
  2164. switch (node->op) {
  2165. case GGML_OP_ADD_ID:
  2166. req.op = HTP_OP_ADD_ID;
  2167. break;
  2168. default:
  2169. GGML_ABORT("ggml-hex: unsupported op:%d\n", node->op);
  2170. }
  2171. init_htp_tensor(&req.src0, src0);
  2172. init_htp_tensor(&req.src1, src1);
  2173. init_htp_tensor(&req.src2, src2);
  2174. init_htp_tensor(&req.dst, dst);
  2175. dspqueue_buffer bufs[4];
  2176. memset(bufs, 0, sizeof(bufs));
  2177. // First buffer = input activations
  2178. bufs[0].fd = src0_buf->fd;
  2179. bufs[0].ptr = src0->data;
  2180. bufs[0].offset = (uint8_t *) src0->data - src0_buf->base;
  2181. bufs[0].size = ggml_nbytes(src0);
  2182. bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_REF | // Take a reference
  2183. DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
2184. DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
  2185. // Second buffer = experts bias
  2186. bufs[1].fd = src1_buf->fd;
  2187. bufs[1].ptr = src1->data;
  2188. bufs[1].offset = (uint8_t *) src1->data - src1_buf->base;
  2189. bufs[1].size = ggml_nbytes(src1);
  2190. bufs[1].flags = (DSPQUEUE_BUFFER_FLAG_REF | // Take a reference
  2191. DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
  2192. DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
  2193. // Third buffer = activated experts
  2194. bufs[2].fd = src2_buf->fd;
  2195. bufs[2].ptr = src2->data;
  2196. bufs[2].offset = (uint8_t *) src2->data - src2_buf->base;
  2197. bufs[2].size = ggml_nbytes(src2);
  2198. bufs[2].flags = (DSPQUEUE_BUFFER_FLAG_REF | // Take a reference
  2199. DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
  2200. DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
2201. // Fourth buffer = output activations
  2202. bufs[3].fd = dst_buf->fd;
  2203. bufs[3].ptr = dst->data;
  2204. bufs[3].offset = (uint8_t *) dst->data - dst_buf->base;
  2205. bufs[3].size = ggml_nbytes(dst);
  2206. bufs[3].flags = (DSPQUEUE_BUFFER_FLAG_REF | DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
  2207. // Primary DSP session from the src0 tensor
  2208. ggml_hexagon_session * sess = src0_buf->sess;
  2209. if (opt_verbose) {
  2210. char dims[64 * GGML_MAX_SRC];
  2211. char strides[16 * GGML_MAX_SRC];
  2212. char types[16 * GGML_MAX_SRC];
  2213. char buffs[64 * GGML_MAX_SRC];
  2214. char names[64 * GGML_MAX_SRC];
  2215. hex_format_op_dims(dims, op);
  2216. hex_format_op_strides(strides, op);
  2217. hex_format_op_types(types, op);
  2218. hex_format_op_buffs(buffs, op);
  2219. hex_format_op_names(names, op);
  2220. HEX_VERBOSE("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(),
  2221. ggml_op_name(node->op), names, dims, types, strides, buffs, req.flags);
  2222. if (opt_verbose > 1) {
  2223. hex_dump_dspbuf(src0, &bufs[0]);
  2224. hex_dump_dspbuf(src1, &bufs[1]);
  2225. hex_dump_dspbuf(src2, &bufs[2]);
  2226. hex_dump_dspbuf(dst, &bufs[3]);
  2227. }
  2228. }
  2229. if ((opt_opmask & HTP_OPMASK_QUEUE)) {
2230. // Bump pending flag (cleared in the callback once we get the response)
  2231. sess->op_pending++; // atomic inc
  2232. int err = dspqueue_write(sess->queue,
  2233. 0, // flags - the framework will autoset this
  2234. 4, // number of buffers
  2235. bufs, // buffer references
  2236. sizeof(req),
  2237. (const uint8_t *) &req, // Message
  2238. 1000000); // Timeout
  2239. if (0 != err) {
  2240. GGML_ABORT("ggml-hex: %s dspqueue_write failed: 0x%08x\n", sess->name.c_str(), (unsigned) err);
  2241. }
  2242. }
  2243. if (opt_opsync) {
  2244. while (sess->op_pending) {
  2245. ;
  2246. }
  2247. }
  2248. t2 = ggml_time_us();
  2249. HEX_PROFILE(
  2250. "ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u (%f) "
  2251. "call-usec %llu\n",
  2252. sess->name.c_str(), ggml_op_name(node->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
  2253. (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1],
  2254. (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
  2255. (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
  2256. (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
  2257. }
  2258. static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) {
  2259. const struct ggml_tensor * src0 = op->src[0];
  2260. const struct ggml_tensor * src1 = op->src[1];
  2261. const struct ggml_tensor * dst = op;
  2262. uint64_t t1 = 0;
  2263. uint64_t t2 = 0;
  2264. t1 = ggml_time_us();
  2265. // Construct HTP message
  2266. htp_general_req req;
  2267. memset(&req, 0, sizeof(htp_general_req));
  2268. memcpy(&req.op_params, &op->op_params, sizeof(op->op_params));
  2269. req.flags = flags;
  2270. bool supported = false;
  2271. switch (op->op) {
  2272. case GGML_OP_RMS_NORM:
  2273. req.op = HTP_OP_RMS_NORM;
  2274. supported = true;
  2275. break;
  2276. case GGML_OP_UNARY:
  2277. if (ggml_get_unary_op(dst) == GGML_UNARY_OP_SILU) {
  2278. req.op = HTP_OP_UNARY_SILU;
  2279. supported = true;
  2280. }
  2281. break;
  2282. case GGML_OP_GLU:
  2283. if (ggml_get_glu_op(dst) == GGML_GLU_OP_SWIGLU) {
  2284. req.op = HTP_OP_GLU_SWIGLU;
  2285. supported = true;
  2286. } else if (ggml_get_glu_op(dst) == GGML_GLU_OP_SWIGLU_OAI) {
  2287. req.op = HTP_OP_GLU_SWIGLU_OAI;
  2288. supported = true;
  2289. }
  2290. break;
  2291. case GGML_OP_SOFT_MAX:
  2292. req.op = HTP_OP_SOFTMAX;
supported = true;
break;
  2294. default:
  2295. break;
  2296. }
  2297. if (!supported) {
  2298. GGML_ABORT("ggml-hex: unary : unsupported op:%d\n", op->op);
  2299. }
  2300. init_htp_tensor(&req.dst, dst);
  2301. init_htp_tensor(&req.src0, src0);
  2302. if (src1) {
  2303. init_htp_tensor(&req.src1, src1);
  2304. }
  2305. // Use opmask to override flags
  2306. if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) {
  2307. req.flags |= HTP_OPFLAGS_SKIP_QUANTIZE;
  2308. }
  2309. if (!(opt_opmask & HTP_OPMASK_COMPUTE)) {
  2310. req.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
  2311. }
  2312. dspqueue_buffer bufs[3];
  2313. int n_bufs = 0;
  2314. memset(bufs, 0, sizeof(bufs));
// First buffer = first operand (src0)
  2316. // This is a buffer that the CPU writes and the DSP reads, so we'll
  2317. // need to flush CPU caches and invalidate DSP ones. On platforms
  2318. // with I/O coherency support the framework will automatically skip
  2319. // cache operations where possible.
  2320. auto src0_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src0->buffer->context);
  2321. bufs[n_bufs].fd = src0_buf->fd;
  2322. bufs[n_bufs].ptr = src0->data;
  2323. bufs[n_bufs].offset = (uint8_t *) src0->data - src0_buf->base;
  2324. bufs[n_bufs].size = ggml_nbytes(src0);
  2325. bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_REF | // Take a reference
  2326. DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
  2328. ++n_bufs;
  2329. if (src1) {
// Second buffer = second operand (src1), when present
  2331. // This is a buffer that the CPU writes and the DSP reads, so we'll
  2332. // need to flush CPU caches and invalidate DSP ones. On platforms
  2333. // with I/O coherency support the framework will automatically skip
  2334. // cache operations where possible.
  2335. auto src1_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src1->buffer->context);
  2336. bufs[n_bufs].fd = src1_buf->fd;
  2337. bufs[n_bufs].ptr = src1->data;
  2338. bufs[n_bufs].offset = (uint8_t *) src1->data - src1_buf->base;
  2339. bufs[n_bufs].size = ggml_nbytes(src1);
  2340. bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_REF | // Take a reference
  2341. DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
  2342. DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
  2343. ++n_bufs;
  2344. }
// Second or third buffer = Output Activations. We'll handle DSP
  2347. // cache maintenance in the response message but need to flush
  2348. // CPU caches to ensure any previously written dirty lines are
  2349. // written out before writes from the DSP start.
  2350. auto dst_buf = static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context);
  2351. bufs[n_bufs].fd = dst_buf->fd;
  2352. bufs[n_bufs].ptr = dst->data;
  2353. bufs[n_bufs].offset = (uint8_t *) dst->data - dst_buf->base;
  2354. bufs[n_bufs].size = ggml_nbytes(dst);
  2355. bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_REF | DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
  2356. ++n_bufs;
  2357. // Primary DSP session from the src0 tensor
  2358. ggml_hexagon_session * sess = src0_buf->sess;
  2359. if (opt_verbose) {
  2360. char dims[64 * GGML_MAX_SRC];
  2361. char strides[64 * GGML_MAX_SRC];
  2362. char types[16 * GGML_MAX_SRC];
  2363. char buffs[64 * GGML_MAX_SRC];
  2364. char names[64 * GGML_MAX_SRC];
  2365. hex_format_op_dims(dims, op);
  2366. hex_format_op_strides(strides, op);
  2367. hex_format_op_types(types, op);
  2368. hex_format_op_buffs(buffs, op);
  2369. hex_format_op_names(names, op);
  2370. HEX_VERBOSE("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op),
  2371. names, dims, types, strides, buffs, req.flags);
  2372. if (opt_verbose > 1) {
  2373. hex_dump_dspbuf(src0, &bufs[0]);
  2374. if (src1) {
  2375. hex_dump_dspbuf(src1, &bufs[1]);
  2376. hex_dump_dspbuf(dst, &bufs[2]);
  2377. } else {
  2378. hex_dump_dspbuf(dst, &bufs[1]);
  2379. }
  2380. }
  2381. }
  2382. if ((opt_opmask & HTP_OPMASK_QUEUE)) {
// Bump pending flag (cleared in the callback once we get the response)
  2384. sess->op_pending++; // atomic inc
  2385. int err = dspqueue_write(sess->queue,
  2386. 0, // flags - the framework will autoset this
  2387. n_bufs, // number of buffers
  2388. bufs, // buffer references
  2389. sizeof(req),
  2390. (const uint8_t *) &req, // Message
  2391. 1000000); // Timeout
  2392. if (0 != err) {
  2393. GGML_ABORT("ggml-hex: %s dspqueue_write failed: 0x%08x\n", sess->name.c_str(), (unsigned) err);
  2394. }
  2395. }
  2396. if (opt_opsync) {
  2397. while (sess->op_pending) {
  2398. ;
  2399. }
  2400. }
  2401. t2 = ggml_time_us();
  2402. if (src1) {
  2403. HEX_PROFILE(
  2404. "ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u "
  2405. "(%f) call-usec %llu\n",
  2406. sess->name.c_str(), ggml_op_name(op->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
  2407. (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1],
  2408. (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
  2409. (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
  2410. (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
  2411. } else {
  2412. HEX_PROFILE(
  2413. "ggml-hex: %s %s %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u (%f) call-usec "
  2414. "%llu\n",
  2415. sess->name.c_str(), ggml_op_name(op->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
  2416. (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
  2417. (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
  2418. (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
  2419. }
  2420. }
  2421. static void ggml_hexagon_rope(const struct ggml_tensor * op, uint32_t flags) {
  2422. const struct ggml_tensor * src0 = op->src[0];
  2423. const struct ggml_tensor * src1 = op->src[1];
  2424. const struct ggml_tensor * src2 = op->src[2];
  2425. const struct ggml_tensor * dst = op;
  2426. uint64_t t1 = 0;
  2427. uint64_t t2 = 0;
  2428. t1 = ggml_time_us();
  2429. // Construct HTP message
  2430. htp_general_req req;
  2431. memset(&req, 0, sizeof(htp_general_req));
  2432. memcpy(&req.op_params, &op->op_params, sizeof(op->op_params));
  2433. req.flags = flags;
  2434. req.op = HTP_OP_ROPE;
  2435. init_htp_tensor(&req.dst, dst);
  2436. init_htp_tensor(&req.src0, src0);
  2437. init_htp_tensor(&req.src1, src1);
  2438. if (src2) {
  2439. init_htp_tensor(&req.src2, src2);
  2440. }
  2441. // Use opmask to override flags
  2442. if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) {
  2443. req.flags |= HTP_OPFLAGS_SKIP_QUANTIZE;
  2444. }
  2445. if (!(opt_opmask & HTP_OPMASK_COMPUTE)) {
  2446. req.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
  2447. }
  2448. dspqueue_buffer bufs[4];
  2449. int n_bufs = 0;
  2450. memset(bufs, 0, sizeof(bufs));
  2451. // First buffer
  2452. // This is a buffer that the CPU writes and the DSP reads, so we'll
  2453. // need to flush CPU caches and invalidate DSP ones. On platforms
  2454. // with I/O coherency support the framework will automatically skip
  2455. // cache operations where possible.
  2456. auto src0_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src0->buffer->context);
  2457. bufs[n_bufs].fd = src0_buf->fd;
  2458. bufs[n_bufs].ptr = src0->data;
  2459. bufs[n_bufs].offset = (uint8_t *) src0->data - src0_buf->base;
  2460. bufs[n_bufs].size = ggml_nbytes(src0);
  2461. bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_REF | // Take a reference
  2462. DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
  2464. ++n_bufs;
  2465. // Second buffer
  2466. // This is a buffer that the CPU writes and the DSP reads, so we'll
  2467. // need to flush CPU caches and invalidate DSP ones. On platforms
  2468. // with I/O coherency support the framework will automatically skip
  2469. // cache operations where possible.
  2470. auto src1_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src1->buffer->context);
  2471. bufs[n_bufs].fd = src1_buf->fd;
  2472. bufs[n_bufs].ptr = src1->data;
  2473. bufs[n_bufs].offset = (uint8_t *) src1->data - src1_buf->base;
  2474. bufs[n_bufs].size = ggml_nbytes(src1);
  2475. bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_REF | // Take a reference
  2476. DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
  2477. DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
  2478. ++n_bufs;
  2479. if (src2) {
  2480. // Third buffer
  2481. // This is a buffer that the CPU writes and the DSP reads, so we'll
  2482. // need to flush CPU caches and invalidate DSP ones. On platforms
  2483. // with I/O coherency support the framework will automatically skip
  2484. // cache operations where possible.
  2485. auto src2_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src2->buffer->context);
  2486. bufs[n_bufs].fd = src2_buf->fd;
  2487. bufs[n_bufs].ptr = src2->data;
  2488. bufs[n_bufs].offset = (uint8_t *) src2->data - src2_buf->base;
  2489. bufs[n_bufs].size = ggml_nbytes(src2);
  2490. bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_REF | // Take a reference
  2491. DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
  2492. DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
  2493. ++n_bufs;
  2494. }
// Final buffer = Output Activations. We'll handle DSP
  2497. // cache maintenance in the response message but need to flush
  2498. // CPU caches to ensure any previously written dirty lines are
  2499. // written out before writes from the DSP start.
  2500. auto dst_buf = static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context);
  2501. bufs[n_bufs].fd = dst_buf->fd;
  2502. bufs[n_bufs].ptr = dst->data;
  2503. bufs[n_bufs].offset = (uint8_t *) dst->data - dst_buf->base;
  2504. bufs[n_bufs].size = ggml_nbytes(dst);
  2505. bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_REF | DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
  2506. ++n_bufs;
  2507. // Primary DSP session from the src0 tensor
  2508. ggml_hexagon_session * sess = src0_buf->sess;
  2509. if (opt_verbose) {
  2510. char dims[64 * GGML_MAX_SRC];
  2511. char strides[64 * GGML_MAX_SRC];
  2512. char types[16 * GGML_MAX_SRC];
  2513. char buffs[64 * GGML_MAX_SRC];
  2514. char names[64 * GGML_MAX_SRC];
  2515. hex_format_op_dims(dims, op);
  2516. hex_format_op_strides(strides, op);
  2517. hex_format_op_types(types, op);
  2518. hex_format_op_buffs(buffs, op);
  2519. hex_format_op_names(names, op);
  2520. HEX_VERBOSE("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op),
  2521. names, dims, types, strides, buffs, req.flags);
  2522. if (opt_verbose > 1) {
hex_dump_dspbuf(src0, &bufs[0]);
hex_dump_dspbuf(src1, &bufs[1]);
if (src2) {
hex_dump_dspbuf(src2, &bufs[2]);
hex_dump_dspbuf(dst, &bufs[3]);
} else {
hex_dump_dspbuf(dst, &bufs[2]);
}
  2530. }
  2531. }
  2532. if ((opt_opmask & HTP_OPMASK_QUEUE)) {
// Bump pending flag (cleared in the callback once we get the response)
  2534. sess->op_pending++; // atomic inc
  2535. int err = dspqueue_write(sess->queue,
  2536. 0, // flags - the framework will autoset this
  2537. n_bufs, // number of buffers
  2538. bufs, // buffer references
  2539. sizeof(req),
  2540. (const uint8_t *) &req, // Message
  2541. 1000000); // Timeout
  2542. if (0 != err) {
  2543. GGML_ABORT("ggml-hex: %s dspqueue_write failed: 0x%08x\n", sess->name.c_str(), (unsigned) err);
  2544. }
  2545. }
  2546. if (opt_opsync) {
  2547. while (sess->op_pending) {
  2548. ;
  2549. }
  2550. }
  2551. t2 = ggml_time_us();
  2552. if (src2) {
  2553. HEX_PROFILE(
  2554. "ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles "
  2555. "%u op-pkts %u (%f) call-usec %llu\n",
  2556. sess->name.c_str(), ggml_op_name(op->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
  2557. (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1],
  2558. (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], src2->name, (uint32_t) src2->ne[0], (uint32_t) src2->ne[1],
  2559. (uint32_t) src2->ne[2], (uint32_t) src2->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
  2560. (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
  2561. (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
  2562. } else {
  2563. HEX_PROFILE(
  2564. "ggml-hex: %s %s %s %u:%u:%u:%u x %s %u:%u:%u:%u -> %s %u:%u:%u:%u : op-usec %u op-cycles %u op-pkts %u "
  2565. "(%f) call-usec %llu\n",
  2566. sess->name.c_str(), ggml_op_name(op->op), src0->name, (uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
  2567. (uint32_t) src0->ne[2], (uint32_t) src0->ne[3], src1->name, (uint32_t) src1->ne[0], (uint32_t) src1->ne[1],
  2568. (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], dst->name, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
  2569. (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], sess->prof_usecs, sess->prof_cycles, sess->prof_pkts,
  2570. (float) sess->prof_cycles / sess->prof_pkts, (unsigned long long) t2 - t1);
  2571. }
  2572. }
  2573. static const char * ggml_backend_hexagon_name(ggml_backend_t backend) {
  2574. auto sess = static_cast<ggml_hexagon_session *>(backend->context);
  2575. return sess->name.c_str();
  2576. }
  2577. static void ggml_backend_hexagon_free(ggml_backend_t backend) {
  2578. // we just need to delete the backend here
  2579. // the sessions are allocated & freed as part of the registry
  2580. delete backend;
  2581. }
  2582. static inline bool op_reuse_src1(const ggml_tensor * op1, const ggml_tensor * op0) {
  2583. return (op0 && op0->src[1] == op1->src[1]);
  2584. }
  2585. // scan the graph and figure out last compute op index
  2586. static inline int last_compute_op(ggml_cgraph * graph) {
int last = -1; // -1 if the graph has no compute ops
  2588. for (int i = 0; i < graph->n_nodes; ++i) {
  2589. ggml_tensor * node = graph->nodes[i];
  2590. switch (node->op) {
  2591. case GGML_OP_MUL_MAT:
  2592. case GGML_OP_MUL_MAT_ID:
  2593. case GGML_OP_MUL:
  2594. case GGML_OP_ADD:
  2595. case GGML_OP_SUB:
  2596. case GGML_OP_RMS_NORM:
  2597. case GGML_OP_GLU:
  2598. case GGML_OP_ADD_ID:
  2599. last = i;
  2600. break;
  2601. default:
  2602. break;
  2603. }
  2604. }
  2605. return last;
  2606. }
  2607. static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, ggml_cgraph * graph) {
  2608. auto sess = static_cast<ggml_hexagon_session *>(backend->context);
  2609. HEX_VERBOSE("ggml-hex: %s graph-compute n_nodes %d\n", sess->name.c_str(), graph->n_nodes);
  2610. const int last = last_compute_op(graph);
  2611. const struct ggml_tensor * prev_quant_op = nullptr; // prev executed op with quantizer
  2612. for (int i = 0; i < graph->n_nodes; ++i) {
  2613. ggml_tensor * node = graph->nodes[i];
  2614. uint32_t flags = 0;
  2615. // skip quantizer if src1 is reused
  2616. if (op_reuse_src1(node, prev_quant_op)) {
  2617. flags |= HTP_OPFLAGS_SKIP_QUANTIZE;
  2618. }
  2619. // ask for early notification for the last Op
  2620. if (i == last) {
  2621. flags |= HTP_OPFLAGS_EARLY_WAKEUP;
  2622. }
  2623. switch (node->op) {
  2624. case GGML_OP_MUL_MAT:
  2625. ggml_hexagon_mul_mat(node, flags);
  2626. prev_quant_op = node;
  2627. break;
  2628. case GGML_OP_MUL_MAT_ID:
  2629. ggml_hexagon_mul_mat_id(node, flags);
  2630. prev_quant_op = node;
  2631. break;
  2632. case GGML_OP_MUL:
  2633. case GGML_OP_ADD:
  2634. case GGML_OP_SUB:
  2635. ggml_hexagon_binary(node, flags);
  2636. break;
  2637. case GGML_OP_ADD_ID:
  2638. ggml_hexagon_add_id(node, flags);
  2639. break;
  2640. case GGML_OP_RMS_NORM:
  2641. ggml_hexagon_unary(node, flags);
  2642. break;
  2643. case GGML_OP_UNARY:
  2644. if (ggml_get_unary_op(node) == GGML_UNARY_OP_SILU) {
  2645. ggml_hexagon_unary(node, flags);
  2646. }
  2647. break;
  2648. case GGML_OP_GLU:
  2649. if ((ggml_get_glu_op(node) == GGML_GLU_OP_SWIGLU) ||
  2650. (ggml_get_glu_op(node) == GGML_GLU_OP_SWIGLU_OAI)) {
  2651. ggml_hexagon_unary(node, flags);
  2652. }
  2653. break;
  2654. case GGML_OP_SOFT_MAX:
  2655. ggml_hexagon_unary(node, flags);
  2656. break;
  2657. case GGML_OP_ROPE:
  2658. ggml_hexagon_rope(node, flags);
  2659. break;
  2660. // non-compute ops
  2661. case GGML_OP_NONE:
  2662. case GGML_OP_RESHAPE:
  2663. case GGML_OP_VIEW:
  2664. case GGML_OP_PERMUTE:
  2665. case GGML_OP_TRANSPOSE:
  2666. break;
  2667. default:
  2668. GGML_ABORT("\nggml-hex: graph-compute %s is not supported\n", ggml_op_desc(node));
  2669. }
  2670. }
  2671. // Wait until all pending ops complete
  2672. while (sess->op_pending) {
  2673. ;
  2674. }
  2675. return GGML_STATUS_SUCCESS;
  2676. }
  2677. static void ggml_backend_hexagon_synchronize(ggml_backend_t backend) {
  2678. auto sess = static_cast<ggml_hexagon_session *>(backend->context);
  2679. HEX_VERBOSE("ggml-hex: %s synchronize\n", sess->name.c_str());
  2680. // Wait until all pending ops complete
  2681. while (sess->op_pending) {
  2682. ;
  2683. }
  2684. }
  2685. struct node_info {
  2686. ggml_tensor * node;
  2687. std::vector<ggml_tensor *> fused;
  2688. ggml_op op() const {
  2689. return node->op;
  2690. }
  2691. const ggml_tensor * dst() const {
  2692. return fused.empty() ? node : fused.back();
  2693. }
  2694. const ggml_tensor * src0() const {
  2695. return node->src[0];
  2696. }
  2697. const ggml_tensor * src1() const {
  2698. return node->src[1];
  2699. }
  2700. bool is_empty() const {
  2701. return ggml_op_is_empty(node->op);
  2702. }
  2703. void add_fused(ggml_tensor * t) {
  2704. fused.push_back(t);
  2705. }
  2706. bool stackable() const {
  2707. switch (this->op()) {
  2708. case GGML_OP_MUL_MAT:
  2709. case GGML_OP_MUL_MAT_ID:
  2710. return ggml_is_quantized(this->src0()->type);
  2711. default:
  2712. return false;
  2713. }
  2714. }
  2715. bool same_input(const node_info& n) const {
  2716. return n.src1() == this->src1();
  2717. }
  2718. };
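// Example (for reference): when RMS_NORM is immediately followed by a MUL that
// applies the norm weights, ggml_backend_hexagon_graph_optimize below packs the
// MUL into the RMS_NORM's node_info via add_fused() (provided ggml_can_fuse()
// accepts the pair), so op() still reports GGML_OP_RMS_NORM while dst() returns
// the MUL output and the reorder pass treats the pair as a single unit.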
  2719. static std::vector<int> ggml_hexagon_graph_optimize_reorder(const std::vector<node_info> & nodes) {
  2720. const int n = nodes.size();
  2721. std::vector<int> res;
  2722. res.reserve(n);
  2723. std::vector<bool> used(n, false);
  2724. // The main goal here is to stack the MUL_MAT ops with the same src1 input.
// This allows us to reuse the dynamically quantized src1 held in VTCM.
// TODO: the current version might do incorrect reordering in cases where the quantized src0
  2727. // input is an output of another Op.
  2728. for (int i0 = 0; i0 < n; i0++) {
  2729. if (used[i0]) {
  2730. continue;
  2731. }
  2732. res.push_back(i0);
  2733. const auto & node0 = nodes[i0];
  2734. if (!node0.stackable()) {
  2735. continue;
  2736. }
// how many nodes to look ahead when searching for stackable nodes that can reuse VTCM
  2738. constexpr int N_FORWARD = 8;
  2739. for (int i1 = i0 + 1; i1 < i0 + N_FORWARD && i1 < n; i1++) {
  2740. if (used[i1]) {
  2741. continue;
  2742. }
  2743. const auto & node1 = nodes[i1];
  2744. if (node1.stackable() && node1.same_input(node0)) {
  2745. res.push_back(i1);
  2746. used[i1] = true;
  2747. }
  2748. }
  2749. }
  2750. return res;
  2751. }
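// Worked example (illustrative, assuming Wq/Wk/Wv are quantized weight tensors):
// for the node sequence
//   0: MUL_MAT(Wq, x)   1: RMS_NORM(y)   2: MUL_MAT(Wk, x)   3: MUL_MAT(Wv, x)
// node 0 is stackable and shares src1 == x with nodes 2 and 3, so the loop above
// returns the order [0, 2, 3, 1]: the three MUL_MATs run back to back and can
// reuse the dynamically quantized x held in VTCM, while the RMS_NORM is deferred
// until after them.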
  2752. static void ggml_backend_hexagon_graph_optimize(ggml_backend_t backend, ggml_cgraph * gf) {
  2753. const int n = gf->n_nodes;
  2754. constexpr int MAX_FUSE = 16;
  2755. enum ggml_op ops[MAX_FUSE];
  2756. std::vector<node_info> nodes;
  2757. nodes.reserve(gf->n_nodes);
  2758. // fuse nodes:
  2759. // we don't want to make reorders that break fusing, so we first pack all fusable tensors
  2760. // and perform the reorder over the fused nodes. after the reorder is done, we unfuse
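// (in other words: pack fusable chains into node_info entries, reorder the packed
// entries with ggml_hexagon_graph_optimize_reorder, then flatten them back into
// gf->nodes keeping each chain contiguous)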
  2761. for (int i = 0; i < n; i++) {
  2762. node_info node = {
  2763. /*.node =*/ gf->nodes[i],
  2764. /*.fused =*/ {},
  2765. };
  2766. // fuse only ops that start with these operations
  2767. // can be expanded when needed
  2768. if (node.op() == GGML_OP_ADD ||
  2769. node.op() == GGML_OP_NORM ||
  2770. node.op() == GGML_OP_RMS_NORM) {
  2771. ops[0] = node.op();
  2772. int f = i + 1;
  2773. while (f < n && f < i + MAX_FUSE) {
  2774. // conservatively allow fusing only these ops
  2775. // can be expanded when needed
  2776. if (gf->nodes[f]->op != GGML_OP_ADD &&
  2777. gf->nodes[f]->op != GGML_OP_MUL &&
  2778. gf->nodes[f]->op != GGML_OP_NORM &&
  2779. gf->nodes[f]->op != GGML_OP_RMS_NORM) {
  2780. break;
  2781. }
  2782. ops[f - i] = gf->nodes[f]->op;
  2783. f++;
  2784. }
  2785. f -= i;
  2786. for (; f > 1; f--) {
  2787. if (ggml_can_fuse(gf, i, ops, f)) {
  2788. break;
  2789. }
  2790. }
  2791. // add the fused tensors into the node info so we can unfuse them later
  2792. for (int k = 1; k < f; k++) {
  2793. ++i;
  2794. // the .dst() becomes the last fused tensor
  2795. node.add_fused(gf->nodes[i]);
  2796. }
  2797. }
  2798. nodes.push_back(std::move(node));
  2799. }
  2800. const auto order = ggml_hexagon_graph_optimize_reorder(nodes);
  2801. // unfuse
  2802. {
  2803. int j = 0;
  2804. for (const auto i : order) {
  2805. const auto & node = nodes[i];
  2806. gf->nodes[j++] = node.node;
  2807. for (auto * fused : node.fused) {
  2808. gf->nodes[j++] = fused;
  2809. }
  2810. }
  2811. }
  2812. }
  2813. static struct ggml_backend_i hexagon_backend_i = {
  2814. /* .get_name = */ ggml_backend_hexagon_name,
  2815. /* .free = */ ggml_backend_hexagon_free,
  2816. /* .set_tensor_async = */ NULL,
  2817. /* .get_tensor_async = */ NULL,
  2818. /* .cpy_tensor_async = */ NULL,
  2819. /* .synchronize = */ ggml_backend_hexagon_synchronize,
  2820. /* .graph_plan_create = */ NULL,
  2821. /* .graph_plan_free = */ NULL,
  2822. /* .graph_plan_update = */ NULL,
  2823. /* .graph_plan_compute = */ NULL,
  2824. /* .graph_compute = */ ggml_backend_hexagon_graph_compute,
  2825. /* .event_record = */ NULL,
  2826. /* .event_wait = */ NULL,
  2827. /* .graph_optimize = */ ggml_backend_hexagon_graph_optimize,
  2828. };
  2829. static ggml_guid_t ggml_backend_hexagon_guid() {
  2830. static ggml_guid guid = { 0x7b, 0x57, 0xdc, 0xaf, 0xde, 0x12, 0x1d, 0x49,
  2831. 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11 };
  2832. return &guid;
  2833. }
  2834. bool ggml_backend_is_hexagon(ggml_backend_t backend) {
  2835. return backend && backend->iface.get_name == ggml_backend_hexagon_name;
  2836. }
  2837. // device interface
  2838. static ggml_backend_t ggml_backend_hexagon_device_init(ggml_backend_dev_t dev, const char * params) {
  2839. auto sess = static_cast<ggml_hexagon_session *>(dev->context);
  2840. return new ggml_backend{
  2841. /* .guid = */ ggml_backend_hexagon_guid(),
  2842. /* .interface = */ hexagon_backend_i,
  2843. /* .device = */ dev,
  2844. /* .context = */ sess,
  2845. };
  2846. GGML_UNUSED(params);
  2847. }
  2848. static const char * ggml_backend_hexagon_device_get_name(ggml_backend_dev_t dev) {
  2849. auto sess = static_cast<ggml_hexagon_session *>(dev->context);
  2850. return sess->name.c_str();
  2851. GGML_UNUSED(dev);
  2852. }
  2853. static const char * ggml_backend_hexagon_device_get_description(ggml_backend_dev_t dev) {
  2854. return "Hexagon";
  2855. GGML_UNUSED(dev);
  2856. }
  2857. static void ggml_backend_hexagon_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
  2858. // ~2GB per session for now
  2859. *free = 2ULL * 1024 * 1024 * 1024;
  2860. *total = *free;
  2861. GGML_UNUSED(dev);
  2862. }
  2863. static enum ggml_backend_dev_type ggml_backend_hexagon_device_get_type(ggml_backend_dev_t dev) {
  2864. return GGML_BACKEND_DEVICE_TYPE_GPU;
  2865. GGML_UNUSED(dev);
  2866. }
  2867. static void ggml_backend_hexagon_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
  2868. props->name = ggml_backend_hexagon_device_get_name(dev);
  2869. props->description = ggml_backend_hexagon_device_get_description(dev);
  2870. props->type = ggml_backend_hexagon_device_get_type(dev);
  2871. ggml_backend_hexagon_device_get_memory(dev, &props->memory_free, &props->memory_total);
  2872. props->caps = {
  2873. /* .async = */ true,
  2874. /* .host_buffer = */ (bool) opt_hostbuf,
  2875. /* .buffer_from_host_ptr = */ false,
  2876. /* .events = */ false,
  2877. };
  2878. }
  2879. static ggml_backend_buffer_type_t ggml_backend_hexagon_device_get_buffer_type(ggml_backend_dev_t dev) {
  2880. auto sess = static_cast<ggml_hexagon_session *>(dev->context);
  2881. return &sess->buffer_type;
  2882. }
  2883. static ggml_backend_buffer_type_t ggml_backend_hexagon_device_get_repack_buffer_type(ggml_backend_dev_t dev) {
  2884. auto sess = static_cast<ggml_hexagon_session *>(dev->context);
  2885. return &sess->repack_buffer_type;
  2886. }
  2887. static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
  2888. auto sess = static_cast<ggml_hexagon_session *>(dev->context);
  2889. bool supp = false;
  2890. switch (op->op) {
  2891. case GGML_OP_NONE:
  2892. case GGML_OP_RESHAPE:
  2893. case GGML_OP_VIEW:
  2894. case GGML_OP_PERMUTE:
  2895. case GGML_OP_TRANSPOSE:
  2896. supp = true;
  2897. break;
  2898. case GGML_OP_MUL_MAT:
  2899. supp = ggml_hexagon_supported_mul_mat(sess, op);
  2900. break;
  2901. case GGML_OP_MUL_MAT_ID:
  2902. supp = ggml_hexagon_supported_mul_mat_id(sess, op);
  2903. break;
  2904. case GGML_OP_MUL:
  2905. case GGML_OP_ADD:
  2906. case GGML_OP_SUB:
  2907. supp = ggml_hexagon_supported_binary(sess, op);
  2908. break;
  2909. case GGML_OP_ADD_ID:
  2910. supp = ggml_hexagon_supported_add_id(sess, op);
  2911. break;
  2912. case GGML_OP_RMS_NORM:
  2913. supp = ggml_hexagon_supported_unary(sess, op);
  2914. break;
  2915. case GGML_OP_SOFT_MAX:
  2916. supp = ggml_hexagon_supported_softmax(sess, op);
  2917. break;
  2918. case GGML_OP_UNARY:
  2919. if (ggml_get_unary_op(op) == GGML_UNARY_OP_SILU) {
  2920. supp = ggml_hexagon_supported_activations(sess, op);
  2921. }
  2922. break;
  2923. case GGML_OP_GLU:
  2924. if ((ggml_get_glu_op(op) == GGML_GLU_OP_SWIGLU) /* || (ggml_get_glu_op(op) == GGML_GLU_OP_SWIGLU_OAI) */) {
  2925. supp = ggml_hexagon_supported_activations(sess, op);
  2926. }
  2927. break;
  2928. case GGML_OP_ROPE:
  2929. supp = ggml_hexagon_supported_rope(sess, op);
  2930. break;
  2931. default:
  2932. break;
  2933. }
  2934. if (opt_verbose) {
  2935. char dims[64 * GGML_MAX_SRC];
  2936. char strides[64 * GGML_MAX_SRC];
  2937. char types[16 * GGML_MAX_SRC];
  2938. char buffs[64 * GGML_MAX_SRC];
  2939. char names[64 * GGML_MAX_SRC];
  2940. hex_format_op_dims(dims, op);
  2941. hex_format_op_strides(strides, op);
  2942. hex_format_op_types(types, op);
  2943. hex_format_op_buffs(buffs, op);
  2944. hex_format_op_names(names, op);
  2945. HEX_VERBOSE("ggml-hex: %s device-supports-op %s : %s : %s : %s : %s : %s : (%d)\n", sess->name.c_str(),
  2946. ggml_op_name(op->op), names, dims, types, strides, buffs, (int) supp);
  2947. }
  2948. return supp;
  2949. GGML_UNUSED(dev);
  2950. }
  2951. static bool ggml_backend_hexagon_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
  2952. if (buft->iface.get_alignment != ggml_backend_hexagon_buffer_type_get_alignment) {
  2953. return false;
  2954. }
  2955. auto s0 = static_cast<ggml_hexagon_session *>(dev->context);
  2956. auto s1 = static_cast<ggml_backend_hexagon_buffer_type_context *>(buft->context)->sess;
  2957. // Need session/domain-id for buffers to be compatible
  2958. bool supp = (s0->session_id == s1->session_id);
  2959. HEX_VERBOSE("ggml-hex: %s device-supports-buft %s (%d)\n", s0->name.c_str(), s1->name.c_str(), (int) supp);
  2960. return supp;
  2961. }
  2962. static ggml_backend_buffer_type_t * ggml_backend_hexagon_device_get_extra_buffers_type(ggml_backend_dev_t dev) {
  2963. auto s0 = static_cast<ggml_hexagon_session *>(dev->context);
  2964. HEX_VERBOSE("ggml-hex: device-get-extra-buft : %s \n", s0->name.c_str());
  2965. static ggml_backend_buffer_type_t bufts[2];
  2966. bufts[0] = ggml_backend_hexagon_device_get_repack_buffer_type(dev);
  2967. bufts[1] = NULL;
  2968. return bufts;
  2969. }
  2970. static const struct ggml_backend_device_i ggml_backend_hexagon_device_i = {
  2971. /* .get_name = */ ggml_backend_hexagon_device_get_name,
  2972. /* .get_description = */ ggml_backend_hexagon_device_get_description,
  2973. /* .get_memory = */ ggml_backend_hexagon_device_get_memory,
  2974. /* .get_type = */ ggml_backend_hexagon_device_get_type,
  2975. /* .get_props = */ ggml_backend_hexagon_device_get_props,
  2976. /* .init_backend = */ ggml_backend_hexagon_device_init,
  2977. /* .get_buffer_type = */ ggml_backend_hexagon_device_get_buffer_type,
  2978. /* .get_host_buffer_type = */ NULL, // ggml_backend_hexagon_device_get_host_buffer_type,
  2979. /* .buffer_from_host_ptr = */ NULL, // ggml_backend_hexagon_device_buffer_from_ptr,
  2980. /* .supports_op = */ ggml_backend_hexagon_device_supports_op,
  2981. /* .supports_buft = */ ggml_backend_hexagon_device_supports_buft,
  2982. /* .offload_op = */ NULL, // ggml_backend_hexagon_device_offload_op,
  2983. /* .event_new = */ NULL,
  2984. /* .event_free = */ NULL,
  2985. /* .event_synchronize = */ NULL,
  2986. };
  2987. //** backend registry
  2988. #define GGML_HEXAGON_MAX_SESSIONS 16
  2989. struct ggml_hexagon_registry {
  2990. ggml_hexagon_registry(ggml_backend_reg_t reg);
  2991. ~ggml_hexagon_registry();
  2992. ggml_backend_device devices[GGML_HEXAGON_MAX_SESSIONS];
  2993. };
  2994. ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
  2995. GGML_LOG_INFO("ggml-hex: Hexagon backend (experimental) : allocating new registry : ndev %zu\n", opt_ndev);
  2996. if (!opt_arch) {
  2997. int err = get_hex_arch_ver(CDSP_DOMAIN_ID, &opt_arch);
  2998. if (err != 0) {
  2999. GGML_LOG_ERROR("ggml-hex: failed to query HTP version (err %d) defaulting to v73\n", err);
  3000. opt_arch = 73;
  3001. }
  3002. }
  3003. GGML_LOG_INFO("ggml-hex: Hexagon Arch version v%d\n", opt_arch);
  3004. // Create devices / sessions
  3005. for (size_t i = 0; i < opt_ndev; i++) {
  3006. devices[i].iface = ggml_backend_hexagon_device_i;
  3007. devices[i].reg = reg;
  3008. try {
  3009. devices[i].context = new ggml_hexagon_session(i);
  3010. } catch (std::exception const &exc) {
GGML_LOG_ERROR("ggml-hex: failed to create device/session %zu : %s\n", i, exc.what());
  3012. devices[i].context = nullptr;
  3013. }
  3014. }
  3015. }
  3016. ggml_hexagon_registry::~ggml_hexagon_registry() {
  3017. GGML_LOG_INFO("ggml-hex: releasing registry\n");
  3018. // Release devices / sessions
  3019. for (size_t i = 0; i < opt_ndev; i++) {
  3020. auto sess = static_cast<ggml_hexagon_session *>(devices[i].context);
  3021. delete sess;
  3022. }
  3023. }
  3024. static const char * ggml_backend_hexagon_reg_get_name(ggml_backend_reg_t reg) {
  3025. return "HTP";
  3026. GGML_UNUSED(reg);
  3027. }
  3028. static size_t ggml_backend_hexagon_reg_get_device_count(ggml_backend_reg_t reg) {
  3029. return opt_ndev;
  3030. GGML_UNUSED(reg);
  3031. }
  3032. static ggml_backend_dev_t ggml_backend_hexagon_reg_get_device(ggml_backend_reg_t reg, size_t index) {
  3033. auto hreg = static_cast<ggml_hexagon_registry *>(reg->context);
  3034. if (index >= opt_ndev || !hreg->devices[index].context) {
  3035. return nullptr;
  3036. }
  3037. return &hreg->devices[index];
  3038. }
  3039. static void * ggml_backend_hexagon_get_proc_address(ggml_backend_reg_t reg, const char * name) {
  3040. if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) {
  3041. ggml_backend_dev_get_extra_bufts_t fct = ggml_backend_hexagon_device_get_extra_buffers_type;
  3042. return (void *) fct;
  3043. }
  3044. return NULL;
  3045. }
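// Usage sketch (illustrative; only standard ggml-backend API calls): callers are
// expected to discover the extra (repack) buffer types through this proc address
// rather than a direct symbol:
//
//   auto get_extra_bufts = (ggml_backend_dev_get_extra_bufts_t)
//       ggml_backend_reg_get_proc_address(reg, "ggml_backend_dev_get_extra_bufts");
//   if (get_extra_bufts) {
//       for (ggml_backend_buffer_type_t * p = get_extra_bufts(dev); p && *p; ++p) {
//           // *p is an extra buffer type (here: the repack buffer type) for this device
//       }
//   }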
  3046. static void ggml_hexagon_init(ggml_backend_reg * reg) {
  3047. // Basic sanity checks to make sure definitions match
  3048. static_assert((unsigned int) HTP_TYPE_Q4_0 == (unsigned int) GGML_TYPE_Q4_0,
  3049. "please update hexagon_type to match ggml_type");
  3050. static_assert((unsigned int) HTP_TYPE_Q8_0 == (unsigned int) GGML_TYPE_Q8_0,
  3051. "please update hexagon_type to match ggml_type");
  3052. static_assert((unsigned int) HTP_TYPE_MXFP4 == (unsigned int) GGML_TYPE_MXFP4,
  3053. "please update hexagon_type to match ggml_type");
  3054. const char * str_verbose = getenv("GGML_HEXAGON_VERBOSE");
  3055. const char * str_hostbuf = getenv("GGML_HEXAGON_HOSTBUF");
  3056. opt_verbose = str_verbose ? atoi(str_verbose) : 0;
  3057. opt_profile = getenv("GGML_HEXAGON_PROFILE") != nullptr;
  3058. opt_etm = getenv("GGML_HEXAGON_ETM") != nullptr;
  3059. opt_experimental = getenv("GGML_HEXAGON_EXPERIMENTAL") != nullptr;
  3060. const char * str_opmask = getenv("GGML_HEXAGON_OPMASK");
  3061. if (str_opmask != nullptr) {
  3062. opt_opmask = strtoul(str_opmask, NULL, 0);
  3063. }
  3064. opt_opsync = getenv("GGML_HEXAGON_OPSYNC") != nullptr;
  3065. const char * str_ndev = getenv("GGML_HEXAGON_NDEV");
  3066. if (str_ndev) {
  3067. opt_ndev = strtoul(str_ndev, NULL, 0);
  3068. if (opt_ndev > GGML_HEXAGON_MAX_SESSIONS) {
  3069. opt_ndev = GGML_HEXAGON_MAX_SESSIONS;
  3070. }
  3071. }
  3072. const char * str_nhvx = getenv("GGML_HEXAGON_NHVX");
  3073. if (str_nhvx) {
  3074. opt_nhvx = strtoul(str_nhvx, NULL, 0);
  3075. }
  3076. const char * str_arch = getenv("GGML_HEXAGON_ARCH");
  3077. if (str_arch) {
  3078. if (str_arch[0] == 'v') {
  3079. str_arch++;
  3080. }
  3081. opt_arch = strtoul(str_arch, NULL, 0);
  3082. }
  3083. opt_hostbuf = str_hostbuf ? atoi(str_hostbuf) : 1;
  3084. reg->context = new ggml_hexagon_registry(reg);
  3085. HEX_VERBOSE("ggml-hex: size-of-general-req %zu size-of-general-rsp %zu\n", sizeof(struct htp_general_req),
  3086. sizeof(struct htp_general_rsp));
  3087. }
  3088. static const struct ggml_backend_reg_i ggml_backend_hexagon_reg_i = {
  3089. /* .get_name = */ ggml_backend_hexagon_reg_get_name,
  3090. /* .get_device_count = */ ggml_backend_hexagon_reg_get_device_count,
  3091. /* .get_device = */ ggml_backend_hexagon_reg_get_device,
  3092. /* .get_proc_address = */ ggml_backend_hexagon_get_proc_address,
  3093. };
  3094. ggml_backend_reg_t ggml_backend_hexagon_reg(void) {
  3095. static bool initialized = false;
  3096. static ggml_backend_reg reg = { /* .api_version = */ GGML_BACKEND_API_VERSION,
  3097. /* .iface = */ ggml_backend_hexagon_reg_i,
  3098. /* .context = */ NULL };
  3099. {
  3100. static std::mutex mutex;
  3101. std::lock_guard<std::mutex> lock(mutex);
  3102. if (!initialized) {
  3103. ggml_hexagon_init(&reg);
  3104. }
  3105. initialized = true;
  3106. }
  3107. return &reg;
  3108. }
  3109. GGML_BACKEND_DL_IMPL(ggml_backend_hexagon_reg)
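// Usage sketch (illustrative; standard ggml-backend API, not specific to this backend):
//
//   ggml_backend_reg_t reg = ggml_backend_hexagon_reg();
//   ggml_backend_dev_t dev = ggml_backend_reg_dev_get(reg, 0);      // first HTP device/session
//   ggml_backend_t be      = ggml_backend_dev_init(dev, /*params=*/nullptr);
//   ...
//   ggml_backend_free(be);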