// ggml-cpu.c

#define _CRT_SECURE_NO_DEPRECATE // Disables "unsafe" warnings on Windows
#define _USE_MATH_DEFINES // For M_PI on MSVC

#include "ggml-backend-impl.h"
#include "ggml-backend.h"
#include "traits.h"
#include "ggml-cpu-impl.h"
#include "ggml-cpu.h"
#include "ggml-impl.h"
#include "quants.h"
#include "ggml-threading.h"
#include "unary-ops.h"
#include "binary-ops.h"
#include "vec.h"
#include "ops.h"
#include "ggml.h"

#if defined(_MSC_VER) || defined(__MINGW32__)
#include <malloc.h> // using malloc.h with MSC/MINGW
#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
#include <alloca.h>
#endif

#include <assert.h>
#include <errno.h>
#include <time.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <inttypes.h>
#include <stdio.h>
#include <float.h>
#include <limits.h>
#include <stdarg.h>
#include <signal.h>

#if defined(__gnu_linux__)
#include <syscall.h>
#endif

#ifdef GGML_USE_OPENMP
#include <omp.h>
#endif

#if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
#undef GGML_USE_LLAMAFILE
#endif

#ifdef GGML_USE_LLAMAFILE
#include "llamafile/sgemm.h"
#endif

// Note: once we move threading into a separate C++ file
// we will use std::hardware_destructive_interference_size instead of hardcoding it here,
// and we'll use C++ attribute syntax.
#define GGML_CACHE_LINE 64

#if defined(__clang__) || defined(__GNUC__)
#define GGML_CACHE_ALIGN __attribute__((aligned(GGML_CACHE_LINE)))
#endif

#if defined(__has_feature)
#if __has_feature(thread_sanitizer)
#define GGML_TSAN_ENABLED 1
#endif
#else  // __has_feature
#if defined(__SANITIZE_THREAD__)
#define GGML_TSAN_ENABLED 1
#endif
#endif // __has_feature

#define UNUSED GGML_UNUSED
#define SWAP(x, y, T) do { T SWAP = x; (x) = y; (y) = SWAP; } while (0)

// precomputed f32 table for f16 (256 KB) (simd-mappings.h)
float ggml_table_f32_f16[1 << 16];

#if defined(__ARM_ARCH)
struct ggml_arm_arch_features_type {
    int sve_cnt;
} ggml_arm_arch_features = { 0 };
#endif

#if defined(_WIN32)

#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>

#if defined(_MSC_VER) && !defined(__clang__)
#define GGML_CACHE_ALIGN __declspec(align(GGML_CACHE_LINE))

typedef volatile LONG atomic_int;
typedef atomic_int atomic_bool;
typedef atomic_int atomic_flag;

#define ATOMIC_FLAG_INIT 0

typedef enum {
    memory_order_relaxed,
    memory_order_consume,
    memory_order_acquire,
    memory_order_release,
    memory_order_acq_rel,
    memory_order_seq_cst
} memory_order;

static void atomic_store(atomic_int * ptr, LONG val) {
    InterlockedExchange(ptr, val);
}
static void atomic_store_explicit(atomic_int * ptr, LONG val, memory_order mo) {
    // TODO: add support for explicit memory order
    InterlockedExchange(ptr, val);
}
static LONG atomic_load(atomic_int * ptr) {
    return InterlockedCompareExchange(ptr, 0, 0);
}
static LONG atomic_load_explicit(atomic_int * ptr, memory_order mo) {
    // TODO: add support for explicit memory order
    return InterlockedCompareExchange(ptr, 0, 0);
}
static LONG atomic_fetch_add(atomic_int * ptr, LONG inc) {
    return InterlockedExchangeAdd(ptr, inc);
}
static LONG atomic_fetch_add_explicit(atomic_int * ptr, LONG inc, memory_order mo) {
    // TODO: add support for explicit memory order
    return InterlockedExchangeAdd(ptr, inc);
}
static atomic_bool atomic_flag_test_and_set(atomic_flag * ptr) {
    return InterlockedExchange(ptr, 1);
}
static void atomic_flag_clear(atomic_flag * ptr) {
    InterlockedExchange(ptr, 0);
}
static void atomic_thread_fence(memory_order mo) {
    MemoryBarrier();
}
#else // clang
#include <stdatomic.h>
#endif

typedef HANDLE pthread_t;
typedef DWORD  thread_ret_t;

static int pthread_create(pthread_t * out, void * unused, thread_ret_t(*func)(void *), void * arg) {
    (void) unused;
    HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, arg, 0, NULL);
    if (handle == NULL)
    {
        return EAGAIN;
    }

    *out = handle;
    return 0;
}

static int pthread_join(pthread_t thread, void * unused) {
    (void) unused;
    int ret = (int) WaitForSingleObject(thread, INFINITE);
    CloseHandle(thread);
    return ret;
}

static int sched_yield (void) {
    Sleep (0);
    return 0;
}
#else

#include <pthread.h>
#include <stdatomic.h>
#include <sched.h>
#if defined(__FreeBSD__)
#include <pthread_np.h>
#endif

typedef void * thread_ret_t;

#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>

#endif

typedef pthread_t ggml_thread_t;

#if defined(__APPLE__)
#include <unistd.h>
#include <mach/mach.h>
#include <TargetConditionals.h>
#endif
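
// CPU type traits: per-type kernels used by the CPU backend. For each tensor type this table
// records how to convert a row of f32 values into that type (from_float), which dot-product
// kernel to use (vec_dot), the type the other operand must be converted to for that kernel
// (vec_dot_type), and how many rows the kernel produces per call (nrows; 2 for the ARM int8
// mmla kernels).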
static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
    [GGML_TYPE_F32] = {
        .from_float   = (ggml_from_float_t) ggml_cpu_fp32_to_fp32,
        .vec_dot      = (ggml_vec_dot_t) ggml_vec_dot_f32,
        .vec_dot_type = GGML_TYPE_F32,
        .nrows        = 1,
    },
    [GGML_TYPE_F16] = {
        .from_float   = (ggml_from_float_t) ggml_cpu_fp32_to_fp16,
        .vec_dot      = (ggml_vec_dot_t) ggml_vec_dot_f16,
        .vec_dot_type = GGML_TYPE_F16,
        .nrows        = 1,
    },
    [GGML_TYPE_Q4_0] = {
        .from_float   = quantize_row_q4_0,
        .vec_dot      = ggml_vec_dot_q4_0_q8_0,
        .vec_dot_type = GGML_TYPE_Q8_0,
#if defined (__ARM_FEATURE_MATMUL_INT8)
        .nrows        = 2,
#else
        .nrows        = 1,
#endif
    },
    [GGML_TYPE_Q4_1] = {
        .from_float   = quantize_row_q4_1,
        .vec_dot      = ggml_vec_dot_q4_1_q8_1,
        .vec_dot_type = GGML_TYPE_Q8_1,
#if defined (__ARM_FEATURE_MATMUL_INT8)
        .nrows        = 2,
#else
        .nrows        = 1,
#endif
    },
    [GGML_TYPE_Q5_0] = {
        .from_float   = quantize_row_q5_0,
        .vec_dot      = ggml_vec_dot_q5_0_q8_0,
        .vec_dot_type = GGML_TYPE_Q8_0,
        .nrows        = 1,
    },
    [GGML_TYPE_Q5_1] = {
        .from_float   = quantize_row_q5_1,
        .vec_dot      = ggml_vec_dot_q5_1_q8_1,
        .vec_dot_type = GGML_TYPE_Q8_1,
        .nrows        = 1,
    },
    [GGML_TYPE_Q8_0] = {
        .from_float   = quantize_row_q8_0,
        .vec_dot      = ggml_vec_dot_q8_0_q8_0,
        .vec_dot_type = GGML_TYPE_Q8_0,
#if defined (__ARM_FEATURE_MATMUL_INT8)
        .nrows        = 2,
#else
        .nrows        = 1,
#endif
    },
    [GGML_TYPE_Q8_1] = {
        .from_float   = quantize_row_q8_1,
        .vec_dot_type = GGML_TYPE_Q8_1,
        .nrows        = 1,
    },
    [GGML_TYPE_MXFP4] = {
        .from_float   = quantize_row_mxfp4,
        .vec_dot      = ggml_vec_dot_mxfp4_q8_0,
        .vec_dot_type = GGML_TYPE_Q8_0,
        .nrows        = 1,
    },
    [GGML_TYPE_Q2_K] = {
        .from_float   = quantize_row_q2_K,
        .vec_dot      = ggml_vec_dot_q2_K_q8_K,
        .vec_dot_type = GGML_TYPE_Q8_K,
        .nrows        = 1,
    },
    [GGML_TYPE_Q3_K] = {
        .from_float   = quantize_row_q3_K,
        .vec_dot      = ggml_vec_dot_q3_K_q8_K,
        .vec_dot_type = GGML_TYPE_Q8_K,
        .nrows        = 1,
    },
    [GGML_TYPE_Q4_K] = {
        .from_float   = quantize_row_q4_K,
        .vec_dot      = ggml_vec_dot_q4_K_q8_K,
        .vec_dot_type = GGML_TYPE_Q8_K,
#if defined (__ARM_FEATURE_MATMUL_INT8)
        .nrows        = 2,
#else
        .nrows        = 1,
#endif
    },
    [GGML_TYPE_Q5_K] = {
        .from_float   = quantize_row_q5_K,
        .vec_dot      = ggml_vec_dot_q5_K_q8_K,
        .vec_dot_type = GGML_TYPE_Q8_K,
        .nrows        = 1,
    },
    [GGML_TYPE_Q6_K] = {
        .from_float   = quantize_row_q6_K,
        .vec_dot      = ggml_vec_dot_q6_K_q8_K,
        .vec_dot_type = GGML_TYPE_Q8_K,
#if defined (__ARM_FEATURE_MATMUL_INT8)
        .nrows        = 2,
#else
        .nrows        = 1,
#endif
    },
    [GGML_TYPE_IQ2_XXS] = {
        .from_float   = NULL,
        .vec_dot      = ggml_vec_dot_iq2_xxs_q8_K,
        .vec_dot_type = GGML_TYPE_Q8_K,
        .nrows        = 1,
    },
    [GGML_TYPE_IQ2_XS] = {
        .from_float   = NULL,
        .vec_dot      = ggml_vec_dot_iq2_xs_q8_K,
        .vec_dot_type = GGML_TYPE_Q8_K,
        .nrows        = 1,
    },
    [GGML_TYPE_IQ3_XXS] = {
        // NOTE: from_float for iq3 and iq2_s was removed because these quants require initialization in ggml_quantize_init
        //.from_float   = quantize_row_iq3_xxs,
        .vec_dot      = ggml_vec_dot_iq3_xxs_q8_K,
        .vec_dot_type = GGML_TYPE_Q8_K,
        .nrows        = 1,
    },
    [GGML_TYPE_IQ3_S] = {
        //.from_float   = quantize_row_iq3_s,
        .vec_dot      = ggml_vec_dot_iq3_s_q8_K,
        .vec_dot_type = GGML_TYPE_Q8_K,
        .nrows        = 1,
    },
    [GGML_TYPE_IQ2_S] = {
        //.from_float   = quantize_row_iq2_s,
        .vec_dot      = ggml_vec_dot_iq2_s_q8_K,
        .vec_dot_type = GGML_TYPE_Q8_K,
        .nrows        = 1,
    },
    [GGML_TYPE_IQ1_S] = {
        .from_float   = NULL,
        .vec_dot      = ggml_vec_dot_iq1_s_q8_K,
        .vec_dot_type = GGML_TYPE_Q8_K,
        .nrows        = 1,
    },
    [GGML_TYPE_IQ1_M] = {
        .from_float   = NULL,
        .vec_dot      = ggml_vec_dot_iq1_m_q8_K,
        .vec_dot_type = GGML_TYPE_Q8_K,
        .nrows        = 1,
    },
    [GGML_TYPE_IQ4_NL] = {
        .from_float   = quantize_row_iq4_nl,
        .vec_dot      = ggml_vec_dot_iq4_nl_q8_0,
        .vec_dot_type = GGML_TYPE_Q8_0,
        .nrows        = 1,
    },
    [GGML_TYPE_IQ4_XS] = {
        .from_float   = quantize_row_iq4_xs,
        .vec_dot      = ggml_vec_dot_iq4_xs_q8_K,
        .vec_dot_type = GGML_TYPE_Q8_K,
        .nrows        = 1,
    },
    [GGML_TYPE_Q8_K] = {
        .from_float   = quantize_row_q8_K,
    },
    [GGML_TYPE_BF16] = {
        .from_float   = (ggml_from_float_t) ggml_cpu_fp32_to_bf16,
        .vec_dot      = (ggml_vec_dot_t) ggml_vec_dot_bf16,
        .vec_dot_type = GGML_TYPE_BF16,
        .nrows        = 1,
    },
    [GGML_TYPE_TQ1_0] = {
        .from_float   = quantize_row_tq1_0,
        .vec_dot      = ggml_vec_dot_tq1_0_q8_K,
        .vec_dot_type = GGML_TYPE_Q8_K,
        .nrows        = 1,
    },
    [GGML_TYPE_TQ2_0] = {
        .from_float   = quantize_row_tq2_0,
        .vec_dot      = ggml_vec_dot_tq2_0_q8_K,
        .vec_dot_type = GGML_TYPE_Q8_K,
        .nrows        = 1,
    },
    [GGML_TYPE_I32] = {
        .from_float   = (ggml_from_float_t) ggml_cpu_fp32_to_i32,
    },
};

const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type) {
    return &type_traits_cpu[type];
}
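
// Illustrative sketch (not part of the API) of how these traits are consumed by the matmul
// path further below: convert one f32 row of src1 into the kernel's vec_dot_type layout,
// then take dot products against rows of src0. Names like src1_row_f32, row_buf, src0_row
// and out are placeholders:
//
//     const struct ggml_type_traits_cpu * tt  = ggml_get_type_traits_cpu(src0->type);
//     const struct ggml_type_traits_cpu * ttq = ggml_get_type_traits_cpu(tt->vec_dot_type);
//     ttq->from_float(src1_row_f32, row_buf, ne10);           // f32 row -> vec_dot_type row
//     tt->vec_dot(ne00, &out, 0, src0_row, 0, row_buf, 0, 1); // single-row dot product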
//
// Threading defs
//

typedef pthread_t ggml_thread_t;

#if defined(_WIN32)

typedef CONDITION_VARIABLE ggml_cond_t;
typedef SRWLOCK            ggml_mutex_t;

#define ggml_mutex_init(m)          InitializeSRWLock(m)
#define ggml_mutex_destroy(m)
#define ggml_mutex_lock(m)          AcquireSRWLockExclusive(m)
#define ggml_mutex_unlock(m)        ReleaseSRWLockExclusive(m)
#define ggml_mutex_lock_shared(m)   AcquireSRWLockShared(m)
#define ggml_mutex_unlock_shared(m) ReleaseSRWLockShared(m)

#define ggml_cond_init(c)      InitializeConditionVariable(c)
#define ggml_cond_destroy(c)
#define ggml_cond_wait(c, m)   SleepConditionVariableSRW(c, m, INFINITE, CONDITION_VARIABLE_LOCKMODE_SHARED)
#define ggml_cond_broadcast(c) WakeAllConditionVariable(c)

#define ggml_thread_create pthread_create
#define ggml_thread_join   pthread_join

#else

typedef pthread_cond_t  ggml_cond_t;
typedef pthread_mutex_t ggml_mutex_t;

#define ggml_mutex_init(m)          pthread_mutex_init(m, NULL)
#define ggml_mutex_destroy(m)       pthread_mutex_destroy(m)
#define ggml_mutex_lock(m)          pthread_mutex_lock(m)
#define ggml_mutex_unlock(m)        pthread_mutex_unlock(m)
#define ggml_mutex_lock_shared(m)   pthread_mutex_lock(m)
#define ggml_mutex_unlock_shared(m) pthread_mutex_unlock(m)

#define ggml_lock_init(x)    UNUSED(x)
#define ggml_lock_destroy(x) UNUSED(x)
#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))
#define ggml_lock_lock(x)    _mm_pause()
#else
#define ggml_lock_lock(x)    UNUSED(x)
#endif
#define ggml_lock_unlock(x)  UNUSED(x)

#define GGML_LOCK_INITIALIZER 0

#define ggml_cond_init(c)      pthread_cond_init(c, NULL)
#define ggml_cond_destroy(c)   pthread_cond_destroy(c)
#define ggml_cond_wait(c, m)   pthread_cond_wait(c, m)
#define ggml_cond_broadcast(c) pthread_cond_broadcast(c)

#define ggml_thread_create pthread_create
#define ggml_thread_join   pthread_join

#endif

// Threadpool def
struct ggml_threadpool {
    ggml_mutex_t mutex; // mutex for cond.var
    ggml_cond_t  cond;  // cond.var for waiting for new work

    struct ggml_cgraph * cgraph;
    struct ggml_cplan  * cplan;

    // synchronization primitives
    atomic_int n_graph; // incremented when there is work to be done (i.e. each graph)
    atomic_int GGML_CACHE_ALIGN n_barrier;
    atomic_int GGML_CACHE_ALIGN n_barrier_passed;
    atomic_int GGML_CACHE_ALIGN current_chunk; // currently processing chunk during mat_mul, shared between all the threads

    // these are atomic as an annotation for thread-sanitizer
    atomic_bool stop;  // Used for stopping the threadpool altogether
    atomic_bool pause; // Used for pausing the threadpool or individual threads
    atomic_int  abort; // Used for aborting processing of a graph

    struct ggml_compute_state * workers; // per thread state
    int        n_threads_max; // number of threads in the pool
    atomic_int n_threads_cur; // number of threads used in the current graph

    int32_t  prio; // Scheduling priority
    uint32_t poll; // Polling level (0 - no polling)

    enum ggml_status ec;
};

// Per-thread state
struct ggml_compute_state {
#ifndef GGML_USE_OPENMP
    ggml_thread_t thrd;
    int  last_graph;
    bool pending;
#endif
    bool cpumask[GGML_MAX_N_THREADS];
    struct ggml_threadpool * threadpool;
    int ith;
};

// Helpers for polling loops
#if defined(__aarch64__) && ( defined(__clang__) || defined(__GNUC__) )
static inline void ggml_thread_cpu_relax(void) {
    __asm__ volatile("yield" ::: "memory");
}
#elif defined(__x86_64__)
static inline void ggml_thread_cpu_relax(void) {
    _mm_pause();
}
#else
static inline void ggml_thread_cpu_relax(void) {;}
#endif

//
// NUMA support
//

#define GGML_NUMA_MAX_NODES 8
#define GGML_NUMA_MAX_CPUS 512

struct ggml_numa_node {
    uint32_t cpus[GGML_NUMA_MAX_CPUS]; // hardware threads on this node
    uint32_t n_cpus;
};

struct ggml_numa_nodes {
    enum ggml_numa_strategy numa_strategy;
    struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES];
    uint32_t n_nodes;
    uint32_t total_cpus;   // hardware threads on system
    uint32_t current_node; // node on which main process is executing
#if defined(__gnu_linux__)
    cpu_set_t cpuset; // cpuset from numactl
#else
    uint32_t cpuset; // no NUMA support outside of Linux at this time. Use a portable datatype
#endif
};

//
// ggml state
//

struct ggml_state {
    struct ggml_numa_nodes numa;
};

static struct ggml_state g_state = {0};
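
// Synchronize all threads of a threadpool between ops. Without OpenMP this is a counting
// barrier built on two atomics: n_barrier counts arrivals and is reset by the last thread,
// while n_barrier_passed is bumped once per barrier so waiting threads can spin until it
// changes; the seq-cst operations act as the entry/exit fences.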
void ggml_barrier(struct ggml_threadpool * tp) {
    int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed);
    if (n_threads == 1) {
        return;
    }

#ifdef GGML_USE_OPENMP
    #pragma omp barrier
#else
    int n_passed = atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed);

    // enter barrier (full seq-cst fence)
    int n_barrier = atomic_fetch_add_explicit(&tp->n_barrier, 1, memory_order_seq_cst);

    if (n_barrier == (n_threads - 1)) {
        // last thread
        atomic_store_explicit(&tp->n_barrier, 0, memory_order_relaxed);

        // exit barrier (full seq-cst fence)
        atomic_fetch_add_explicit(&tp->n_barrier_passed, 1, memory_order_seq_cst);
        return;
    }

    // wait for other threads
    while (atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed) == n_passed) {
        ggml_thread_cpu_relax();
    }

    // exit barrier (full seq-cst fence)
    // TSAN doesn't support standalone fence yet, we use a dummy read-modify-write instead
#ifdef GGML_TSAN_ENABLED
    atomic_fetch_add_explicit(&tp->n_barrier_passed, 0, memory_order_seq_cst);
#else
    atomic_thread_fence(memory_order_seq_cst);
#endif
#endif
}

void ggml_threadpool_chunk_set(struct ggml_threadpool * tp, int value) {
    atomic_store_explicit(&tp->current_chunk, value, memory_order_relaxed);
}

int ggml_threadpool_chunk_add(struct ggml_threadpool * tp, int value) {
    return atomic_fetch_add_explicit(&tp->current_chunk, value, memory_order_relaxed);
}

#if defined(__gnu_linux__)
static cpu_set_t ggml_get_numa_affinity(void) {
    cpu_set_t cpuset;
    pthread_t thread;
    thread = pthread_self();
    CPU_ZERO(&cpuset);
    pthread_getaffinity_np(thread, sizeof(cpu_set_t), &cpuset);
    return cpuset;
}
#else
static uint32_t ggml_get_numa_affinity(void) {
    return 0; // no NUMA support
}
#endif
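
// Detect the NUMA topology by probing /sys/devices/system/node and /sys/devices/system/cpu,
// record the calling thread's CPU affinity and current node, and warn if the kernel's
// automatic NUMA balancing is enabled. Only implemented for Linux; elsewhere this is a no-op.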
void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
    if (g_state.numa.n_nodes > 0) {
        fprintf(stderr, "ggml_numa_init: NUMA already initialized\n");
        return;
    }

#if defined(__gnu_linux__)
    struct stat st;
    char path[256];
    int rv;

    // set numa scheme
    g_state.numa.numa_strategy = numa_flag;

    GGML_PRINT_DEBUG("numa strategy %u\n", g_state.numa.numa_strategy);

    g_state.numa.cpuset = ggml_get_numa_affinity();

    // enumerate nodes
    while (g_state.numa.n_nodes < GGML_NUMA_MAX_NODES) {
        rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", g_state.numa.n_nodes);
        GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
        if (stat(path, &st) != 0) { break; }
        ++g_state.numa.n_nodes;
    }

    // enumerate CPUs
    while (g_state.numa.total_cpus < GGML_NUMA_MAX_CPUS) {
        rv = snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%u", g_state.numa.total_cpus);
        GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
        if (stat(path, &st) != 0) { break; }
        ++g_state.numa.total_cpus;
    }

    GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state.numa.n_nodes, g_state.numa.total_cpus);

    // figure out which node we're on
    uint current_cpu;
    int getcpu_ret = 0;
#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ > 33) || defined(__COSMOPOLITAN__)
    getcpu_ret = getcpu(&current_cpu, &g_state.numa.current_node);
#else
    // old glibc doesn't have a wrapper for this call. Fall back on direct syscall
#   if !defined(SYS_getcpu) && defined(SYS_get_cpu)
#       define SYS_getcpu SYS_get_cpu // some older glibc versions use this name
#   endif
    getcpu_ret = syscall(SYS_getcpu, &current_cpu, &g_state.numa.current_node);
#endif

    if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1 || getcpu_ret != 0) {
        g_state.numa.n_nodes = 0;
        return;
    }

    GGML_PRINT_DEBUG("found our process on numa node %u, CPU %u\n", g_state.numa.current_node, current_cpu);

    for (uint32_t n = 0; n < g_state.numa.n_nodes; ++n) {
        struct ggml_numa_node * node = &g_state.numa.nodes[n];
        GGML_PRINT_DEBUG("CPUs on node %u:", n);
        node->n_cpus = 0;
        for (uint32_t c = 0; c < g_state.numa.total_cpus; ++c) {
            rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u/cpu%u", n, c);
            GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
            if (stat(path, &st) == 0) {
                node->cpus[node->n_cpus++] = c;
                GGML_PRINT_DEBUG(" %u", c);
            }
        }
        GGML_PRINT_DEBUG("\n");
    }

    if (ggml_is_numa()) {
        FILE * fptr = fopen("/proc/sys/kernel/numa_balancing", "r");
        if (fptr != NULL) {
            char buf[42];
            if (fgets(buf, sizeof(buf), fptr) && strncmp(buf, "0\n", sizeof(buf)) != 0) {
                GGML_LOG_WARN("/proc/sys/kernel/numa_balancing is enabled, this has been observed to impair performance\n");
            }
            fclose(fptr);
        }
    }
#else
    UNUSED(numa_flag);
    // TODO
#endif
}

bool ggml_is_numa(void) {
    return g_state.numa.n_nodes > 1;
}

#if defined(__ARM_ARCH)

#if defined(__linux__) && defined(__aarch64__)
#include <sys/auxv.h>
#include <sys/prctl.h> // for prctl(PR_SVE_GET_VL) below (may already be pulled in indirectly)
#endif

static void ggml_init_arm_arch_features(void) {
#if defined(__linux__) && defined(__aarch64__) && defined(__ARM_FEATURE_SVE)
    ggml_arm_arch_features.sve_cnt = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
#endif
}
#endif // __ARM_ARCH
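
//
// scalar tensor constructors and element accessors
// (convenience helpers for 1-element tensors and per-element get/set across the basic types)
//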
struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) {
    GGML_ASSERT(!ggml_get_no_alloc(ctx));

    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);

    ggml_set_i32(result, value);

    return result;
}

struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value) {
    GGML_ASSERT(!ggml_get_no_alloc(ctx));

    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);

    ggml_set_f32(result, value);

    return result;
}

struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) {
    const int n     = ggml_nrows(tensor);
    const int nc    = tensor->ne[0];
    const size_t n1 = tensor->nb[1];

    char * const data = tensor->data;

    switch (tensor->type) {
        case GGML_TYPE_I8:
            {
                assert(tensor->nb[0] == sizeof(int8_t));
                for (int i = 0; i < n; i++) {
                    ggml_vec_set_i8(nc, (int8_t *)(data + i*n1), value);
                }
            } break;
        case GGML_TYPE_I16:
            {
                assert(tensor->nb[0] == sizeof(int16_t));
                for (int i = 0; i < n; i++) {
                    ggml_vec_set_i16(nc, (int16_t *)(data + i*n1), value);
                }
            } break;
        case GGML_TYPE_I32:
            {
                assert(tensor->nb[0] == sizeof(int32_t));
                for (int i = 0; i < n; i++) {
                    ggml_vec_set_i32(nc, (int32_t *)(data + i*n1), value);
                }
            } break;
        case GGML_TYPE_F16:
            {
                assert(tensor->nb[0] == sizeof(ggml_fp16_t));
                for (int i = 0; i < n; i++) {
                    ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_CPU_FP32_TO_FP16(value));
                }
            } break;
        case GGML_TYPE_BF16:
            {
                assert(tensor->nb[0] == sizeof(ggml_bf16_t));
                for (int i = 0; i < n; i++) {
                    ggml_vec_set_bf16(nc, (ggml_bf16_t *)(data + i*n1), GGML_FP32_TO_BF16(value));
                }
            } break;
        case GGML_TYPE_F32:
            {
                assert(tensor->nb[0] == sizeof(float));
                for (int i = 0; i < n; i++) {
                    ggml_vec_set_f32(nc, (float *)(data + i*n1), value);
                }
            } break;
        default:
            {
                GGML_ABORT("fatal error");
            }
    }

    return tensor;
}

struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) {
    const int n     = ggml_nrows(tensor);
    const int nc    = tensor->ne[0];
    const size_t n1 = tensor->nb[1];

    char * const data = tensor->data;

    switch (tensor->type) {
        case GGML_TYPE_I8:
            {
                assert(tensor->nb[0] == sizeof(int8_t));
                for (int i = 0; i < n; i++) {
                    ggml_vec_set_i8(nc, (int8_t *)(data + i*n1), value);
                }
            } break;
        case GGML_TYPE_I16:
            {
                assert(tensor->nb[0] == sizeof(int16_t));
                for (int i = 0; i < n; i++) {
                    ggml_vec_set_i16(nc, (int16_t *)(data + i*n1), value);
                }
            } break;
        case GGML_TYPE_I32:
            {
                assert(tensor->nb[0] == sizeof(int32_t));
                for (int i = 0; i < n; i++) {
                    ggml_vec_set_i32(nc, (int32_t *)(data + i*n1), value);
                }
            } break;
        case GGML_TYPE_F16:
            {
                assert(tensor->nb[0] == sizeof(ggml_fp16_t));
                for (int i = 0; i < n; i++) {
                    ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_CPU_FP32_TO_FP16(value));
                }
            } break;
        case GGML_TYPE_BF16:
            {
                assert(tensor->nb[0] == sizeof(ggml_bf16_t));
                for (int i = 0; i < n; i++) {
                    ggml_vec_set_bf16(nc, (ggml_bf16_t *)(data + i*n1), GGML_FP32_TO_BF16(value));
                }
            } break;
        case GGML_TYPE_F32:
            {
                assert(tensor->nb[0] == sizeof(float));
                for (int i = 0; i < n; i++) {
                    ggml_vec_set_f32(nc, (float *)(data + i*n1), value);
                }
            } break;
        default:
            {
                GGML_ABORT("fatal error");
            }
    }

    return tensor;
}
int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i) {
    if (!ggml_is_contiguous(tensor)) {
        int64_t id[4] = { 0, 0, 0, 0 };
        ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]);
        return ggml_get_i32_nd(tensor, id[0], id[1], id[2], id[3]);
    }
    switch (tensor->type) {
        case GGML_TYPE_I8:
            {
                GGML_ASSERT(tensor->nb[0] == sizeof(int8_t));
                return ((int8_t *)(tensor->data))[i];
            }
        case GGML_TYPE_I16:
            {
                GGML_ASSERT(tensor->nb[0] == sizeof(int16_t));
                return ((int16_t *)(tensor->data))[i];
            }
        case GGML_TYPE_I32:
            {
                GGML_ASSERT(tensor->nb[0] == sizeof(int32_t));
                return ((int32_t *)(tensor->data))[i];
            }
        case GGML_TYPE_F16:
            {
                GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
                return GGML_CPU_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
            }
        case GGML_TYPE_BF16:
            {
                GGML_ASSERT(tensor->nb[0] == sizeof(ggml_bf16_t));
                return GGML_BF16_TO_FP32(((ggml_bf16_t *)(tensor->data))[i]);
            }
        case GGML_TYPE_F32:
            {
                GGML_ASSERT(tensor->nb[0] == sizeof(float));
                return ((float *)(tensor->data))[i];
            }
        default:
            {
                GGML_ABORT("fatal error");
            }
    }
}

void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) {
    if (!ggml_is_contiguous(tensor)) {
        int64_t id[4] = { 0, 0, 0, 0 };
        ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]);
        ggml_set_i32_nd(tensor, id[0], id[1], id[2], id[3], value);
        return;
    }
    switch (tensor->type) {
        case GGML_TYPE_I8:
            {
                GGML_ASSERT(tensor->nb[0] == sizeof(int8_t));
                ((int8_t *)(tensor->data))[i] = value;
            } break;
        case GGML_TYPE_I16:
            {
                GGML_ASSERT(tensor->nb[0] == sizeof(int16_t));
                ((int16_t *)(tensor->data))[i] = value;
            } break;
        case GGML_TYPE_I32:
            {
                GGML_ASSERT(tensor->nb[0] == sizeof(int32_t));
                ((int32_t *)(tensor->data))[i] = value;
            } break;
        case GGML_TYPE_F16:
            {
                GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
                ((ggml_fp16_t *)(tensor->data))[i] = GGML_CPU_FP32_TO_FP16(value);
            } break;
        case GGML_TYPE_BF16:
            {
                GGML_ASSERT(tensor->nb[0] == sizeof(ggml_bf16_t));
                ((ggml_bf16_t *)(tensor->data))[i] = GGML_FP32_TO_BF16(value);
            } break;
        case GGML_TYPE_F32:
            {
                GGML_ASSERT(tensor->nb[0] == sizeof(float));
                ((float *)(tensor->data))[i] = value;
            } break;
        default:
            {
                GGML_ABORT("fatal error");
            }
    }
}

int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3) {
    void * data = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3];
    switch (tensor->type) {
        case GGML_TYPE_I8:
            return ((int8_t *) data)[0];
        case GGML_TYPE_I16:
            return ((int16_t *) data)[0];
        case GGML_TYPE_I32:
            return ((int32_t *) data)[0];
        case GGML_TYPE_F16:
            return GGML_CPU_FP16_TO_FP32(((ggml_fp16_t *) data)[0]);
        case GGML_TYPE_BF16:
            return GGML_BF16_TO_FP32(((ggml_bf16_t *) data)[0]);
        case GGML_TYPE_F32:
            return ((float *) data)[0];
        default:
            GGML_ABORT("fatal error");
    }
}

void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value) {
    void * data = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3];
    switch (tensor->type) {
        case GGML_TYPE_I8:
            {
                ((int8_t *)(data))[0] = value;
            } break;
        case GGML_TYPE_I16:
            {
                ((int16_t *)(data))[0] = value;
            } break;
        case GGML_TYPE_I32:
            {
                ((int32_t *)(data))[0] = value;
            } break;
        case GGML_TYPE_F16:
            {
                ((ggml_fp16_t *)(data))[0] = GGML_CPU_FP32_TO_FP16(value);
            } break;
        case GGML_TYPE_BF16:
            {
                ((ggml_bf16_t *)(data))[0] = GGML_FP32_TO_BF16(value);
            } break;
        case GGML_TYPE_F32:
            {
                ((float *)(data))[0] = value;
            } break;
        default:
            {
                GGML_ABORT("fatal error");
            }
    }
}

float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) {
    if (!ggml_is_contiguous(tensor)) {
        int64_t id[4] = { 0, 0, 0, 0 };
        ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]);
        return ggml_get_f32_nd(tensor, id[0], id[1], id[2], id[3]);
    }
    switch (tensor->type) {
        case GGML_TYPE_I8:
            {
                return ((int8_t *)(tensor->data))[i];
            }
        case GGML_TYPE_I16:
            {
                return ((int16_t *)(tensor->data))[i];
            }
        case GGML_TYPE_I32:
            {
                return ((int32_t *)(tensor->data))[i];
            }
        case GGML_TYPE_F16:
            {
                return GGML_CPU_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
            }
        case GGML_TYPE_BF16:
            {
                return GGML_BF16_TO_FP32(((ggml_bf16_t *)(tensor->data))[i]);
            }
        case GGML_TYPE_F32:
            {
                return ((float *)(tensor->data))[i];
            }
        default:
            {
                GGML_ABORT("fatal error");
            }
    }
}

void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) {
    if (!ggml_is_contiguous(tensor)) {
        int64_t id[4] = { 0, 0, 0, 0 };
        ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]);
        ggml_set_f32_nd(tensor, id[0], id[1], id[2], id[3], value);
        return;
    }
    switch (tensor->type) {
        case GGML_TYPE_I8:
            {
                ((int8_t *)(tensor->data))[i] = value;
            } break;
        case GGML_TYPE_I16:
            {
                ((int16_t *)(tensor->data))[i] = value;
            } break;
        case GGML_TYPE_I32:
            {
                ((int32_t *)(tensor->data))[i] = value;
            } break;
        case GGML_TYPE_F16:
            {
                ((ggml_fp16_t *)(tensor->data))[i] = GGML_CPU_FP32_TO_FP16(value);
            } break;
        case GGML_TYPE_BF16:
            {
                ((ggml_bf16_t *)(tensor->data))[i] = GGML_FP32_TO_BF16(value);
            } break;
        case GGML_TYPE_F32:
            {
                ((float *)(tensor->data))[i] = value;
            } break;
        default:
            {
                GGML_ABORT("fatal error");
            }
    }
}

float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3) {
    void * data = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3];
    switch (tensor->type) {
        case GGML_TYPE_I8:
            return ((int8_t *) data)[0];
        case GGML_TYPE_I16:
            return ((int16_t *) data)[0];
        case GGML_TYPE_I32:
            return ((int32_t *) data)[0];
        case GGML_TYPE_F16:
            return GGML_CPU_FP16_TO_FP32(((ggml_fp16_t *) data)[0]);
        case GGML_TYPE_BF16:
            return GGML_BF16_TO_FP32(((ggml_bf16_t *) data)[0]);
        case GGML_TYPE_F32:
            return ((float *) data)[0];
        default:
            GGML_ABORT("fatal error");
    }
}

void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value) {
    void * data = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3];
    switch (tensor->type) {
        case GGML_TYPE_I8:
            {
                ((int8_t *)(data))[0] = value;
            } break;
        case GGML_TYPE_I16:
            {
                ((int16_t *)(data))[0] = value;
            } break;
        case GGML_TYPE_I32:
            {
                ((int32_t *)(data))[0] = value;
            } break;
        case GGML_TYPE_F16:
            {
                ((ggml_fp16_t *)(data))[0] = GGML_CPU_FP32_TO_FP16(value);
            } break;
        case GGML_TYPE_BF16:
            {
                ((ggml_bf16_t *)(data))[0] = GGML_FP32_TO_BF16(value);
            } break;
        case GGML_TYPE_F32:
            {
                ((float *)(data))[0] = value;
            } break;
        default:
            {
                GGML_ABORT("fatal error");
            }
    }
}

////////////////////////////////////////////////////////////////////////////////

// ggml_compute_forward_mul_mat
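
// Compute one [ir0_start, ir0_end) x [ir1_start, ir1_end) chunk of the matmul result.
// Rows are walked in 16x16 blocks; each vec_dot call produces num_rows_per_vec_dot results
// (2 when an mmla kernel is available), which are written into a small local buffer and then
// copied out to dst to reduce false sharing between threads.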
static void ggml_compute_forward_mul_mat_one_chunk(
    const struct ggml_compute_params * params,
    struct ggml_tensor * dst,
    const enum ggml_type type,
    const int64_t num_rows_per_vec_dot,
    const int64_t ir0_start,
    const int64_t ir0_end,
    const int64_t ir1_start,
    const int64_t ir1_end) {

    const struct ggml_tensor * src0 = dst->src[0];
    const struct ggml_tensor * src1 = dst->src[1];

    GGML_TENSOR_BINARY_OP_LOCALS

    const bool src1_cont = ggml_is_contiguous(src1);

    ggml_vec_dot_t const vec_dot      = type_traits_cpu[type].vec_dot;
    enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;

    // broadcast factors
    const int64_t r2 = ne12 / ne02;
    const int64_t r3 = ne13 / ne03;

    //printf("ir0_start = %6lld, ir0_end = %6lld, ir1_start = %6lld, ir1_end = %6lld\n", ir0_start, ir0_end, ir1_start, ir1_end);

    // threads with no work simply yield (not sure if it helps)
    if (ir0_start >= ir0_end || ir1_start >= ir1_end) {
        return;
    }

    const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
    const size_t row_size = ggml_row_size(vec_dot_type, ne10);

    assert(ne12 % ne02 == 0);
    assert(ne13 % ne03 == 0);

    // block-tiling attempt
    const int64_t blck_0 = 16;
    const int64_t blck_1 = 16;

    const size_t src1_col_stride = src1_cont || src1->type != vec_dot_type ? row_size : nb11;

    // attempt to reduce false-sharing (does not seem to make a difference)
    // 16 * 2, accounting for mmla kernels
    float tmp[32];

    for (int64_t iir1 = ir1_start; iir1 < ir1_end; iir1 += blck_1) {
        for (int64_t iir0 = ir0_start; iir0 < ir0_end; iir0 += blck_0) {
            for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir1_end; ir1 += num_rows_per_vec_dot) {
                const int64_t i13 = (ir1 / (ne12 * ne1));
                const int64_t i12 = (ir1 - i13 * ne12 * ne1) / ne1;
                const int64_t i11 = (ir1 - i13 * ne12 * ne1 - i12 * ne1);

                // broadcast src0 into src1
                const int64_t i03 = i13 / r3;
                const int64_t i02 = i12 / r2;

                const int64_t i1 = i11;
                const int64_t i2 = i12;
                const int64_t i3 = i13;

                const char * src0_row = (const char *) src0->data + (0 + i02 * nb02 + i03 * nb03);

                // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
                //       if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
                //       the original src1 data pointer, so we should index using the indices directly
                // TODO: this is a bit of a hack, we should probably have a better way to handle this
                const char * src1_col = (const char *) wdata +
                    (src1_cont || src1->type != vec_dot_type
                        ? (i11 + i12 * ne11 + i13 * ne12 * ne11) * row_size
                        : (i11 * nb11 + i12 * nb12 + i13 * nb13));
                float * dst_col = (float *) ((char *) dst->data + (i1 * nb1 + i2 * nb2 + i3 * nb3));

                //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ++ir0) {
                //    vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
                //}

                for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += num_rows_per_vec_dot) {
                    vec_dot(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), src0_row + ir0 * nb01, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ? src1_col_stride : 0), num_rows_per_vec_dot);
                }

                for (int cn = 0; cn < num_rows_per_vec_dot; ++cn) {
                    memcpy(&dst_col[iir0 + cn * nb1 / nb0], tmp + (cn * 16), (MIN(iir0 + blck_0, ir0_end) - iir0) * sizeof(float));
                }
            }
        }
    }
}
  1065. void ggml_compute_forward_mul_mat(
  1066. const struct ggml_compute_params * params,
  1067. struct ggml_tensor * dst) {
  1068. const struct ggml_tensor * src0 = dst->src[0];
  1069. const struct ggml_tensor * src1 = dst->src[1];
  1070. GGML_TENSOR_BINARY_OP_LOCALS
  1071. const int ith = params->ith;
  1072. const int nth = params->nth;
  1073. enum ggml_type const vec_dot_type = type_traits_cpu[src0->type].vec_dot_type;
  1074. ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float;
  1075. int64_t const vec_dot_num_rows = type_traits_cpu[src0->type].nrows;
  1076. GGML_ASSERT(ne0 == ne01);
  1077. GGML_ASSERT(ne1 == ne11);
  1078. GGML_ASSERT(ne2 == ne12);
  1079. GGML_ASSERT(ne3 == ne13);
  1080. // we don't support permuted src0 or src1
  1081. GGML_ASSERT(nb00 == ggml_type_size(src0->type));
  1082. GGML_ASSERT(nb10 == ggml_type_size(src1->type));
  1083. // dst cannot be transposed or permuted
  1084. GGML_ASSERT(nb0 == sizeof(float));
  1085. GGML_ASSERT(nb0 <= nb1);
  1086. GGML_ASSERT(nb1 <= nb2);
  1087. GGML_ASSERT(nb2 <= nb3);
  1088. // nb01 >= nb00 - src0 is not transposed
  1089. // compute by src0 rows
  1090. // TODO: extract to "extra_op"
  1091. #if GGML_USE_LLAMAFILE
  1092. // broadcast factors
  1093. const int64_t r2 = ne12 / ne02;
  1094. const int64_t r3 = ne13 / ne03;
  1095. const bool src1_cont = ggml_is_contiguous(src1);
  1096. if (src1_cont) {
  1097. for (int64_t i13 = 0; i13 < ne13; i13++)
  1098. for (int64_t i12 = 0; i12 < ne12; i12++)
  1099. if (!llamafile_sgemm(params,
  1100. ne01, ne11, ne00/ggml_blck_size(src0->type),
  1101. (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
  1102. nb01/ggml_type_size(src0->type),
  1103. (const char *)src1->data + i12*nb12 + i13*nb13,
  1104. nb11/ggml_type_size(src1->type),
  1105. (char *)dst->data + i12*nb2 + i13*nb3,
  1106. nb1/ggml_type_size(dst->type),
  1107. src0->type,
  1108. src1->type,
  1109. dst->type))
  1110. goto UseGgmlGemm1;
  1111. return;
  1112. }
  1113. UseGgmlGemm1:;
  1114. #endif
  1115. if (src1->type != vec_dot_type) {
  1116. char * wdata = params->wdata;
  1117. const size_t nbw0 = ggml_type_size(vec_dot_type);
  1118. const size_t nbw1 = ggml_row_size(vec_dot_type, ne10);
  1119. const size_t nbw2 = nbw1*ne11;
  1120. const size_t nbw3 = nbw2*ne12;
  1121. assert(params->wsize >= ne13*nbw3);
  1122. GGML_ASSERT(src1->type == GGML_TYPE_F32);
  1123. #if 0
  1124. for (int64_t i13 = 0; i13 < ne13; ++i13) {
  1125. for (int64_t i12 = 0; i12 < ne12; ++i12) {
  1126. for (int64_t i11 = ith; i11 < ne11; i11 += nth) {
  1127. from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
  1128. (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
  1129. ne10);
  1130. }
  1131. }
  1132. }
  1133. #else
  1134. for (int64_t i13 = 0; i13 < ne13; ++i13) {
  1135. for (int64_t i12 = 0; i12 < ne12; ++i12) {
  1136. for (int64_t i11 = 0; i11 < ne11; ++i11) {
  1137. size_t bs = ggml_blck_size(vec_dot_type);
  1138. int64_t ne10_block_start = (ith * ne10/bs) / nth;
  1139. int64_t ne10_block_end = ((ith + 1) * ne10/bs) / nth;
  1140. from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + ne10_block_start*bs*nb10),
  1141. (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1 + ne10_block_start*nbw0),
  1142. (ne10_block_end - ne10_block_start) * bs);
  1143. }
  1144. }
  1145. }
  1146. #endif
  1147. }
  1148. if (ith == 0) {
  1149. // Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
  1150. atomic_store_explicit(&params->threadpool->current_chunk, nth, memory_order_relaxed);
  1151. }
  1152. ggml_barrier(params->threadpool);
  1153. #if GGML_USE_LLAMAFILE
  1154. if (src1->type != vec_dot_type) {
  1155. const void* wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
  1156. const size_t row_size = ggml_row_size(vec_dot_type, ne10);
  1157. for (int64_t i13 = 0; i13 < ne13; i13++)
  1158. for (int64_t i12 = 0; i12 < ne12; i12++)
  1159. if (!llamafile_sgemm(params,
  1160. ne01, ne11, ne00/ggml_blck_size(src0->type),
  1161. (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
  1162. nb01/ggml_type_size(src0->type),
  1163. (const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size,
  1164. row_size/ggml_type_size(vec_dot_type),
  1165. (char *)dst->data + i12*nb2 + i13*nb3,
  1166. nb1/ggml_type_size(dst->type),
  1167. src0->type,
  1168. vec_dot_type,
  1169. dst->type))
  1170. goto UseGgmlGemm2;
  1171. return;
  1172. }
  1173. UseGgmlGemm2:;
  1174. #endif
  1175. // This is the size of the first dimension of the result, so we can iterate that way. (see the ASSERT above, these are the same numbers)
  1176. const int64_t nr0 = ne0;
  1177. // This is the size of the rest of the dimensions of the result
  1178. const int64_t nr1 = ne1 * ne2 * ne3;
  1179. // Now select a reasonable chunk size.
  1180. int chunk_size = 16;
  1181. // We need to step up the size if it's small
  1182. if (nr0 == 1 || nr1 == 1) {
  1183. chunk_size = 64;
  1184. }
  1185. // distribute the work across the inner or outer loop based on which one is larger
  1186. // The number of chunks in the 0/1 dim.
  1187. // CEIL(nr0/chunk_size)
  1188. int64_t nchunk0 = (nr0 + chunk_size - 1) / chunk_size;
  1189. int64_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size;
  1190. // If the chunking is poor for the number of threads on this setup, scrap the whole plan. Re-chunk it by thread.
1191. // Also, chunking by thread was measured to perform better on NUMA systems. See https://github.com/ggml-org/llama.cpp/pull/6915
1192. // In theory, chunking should be just as useful on NUMA and non-NUMA systems, but testing disagreed with that.
  1193. if (nchunk0 * nchunk1 < nth * 4 || ggml_is_numa()) {
  1194. // distribute the thread work across the inner or outer loop based on which one is larger
  1195. nchunk0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
  1196. nchunk1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
  1197. }
  1198. // The number of elements in each chunk
  1199. const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0;
  1200. const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1;
  1201. // The first chunk comes from our thread_id, the rest will get auto-assigned.
  1202. int current_chunk = ith;
  1203. while (current_chunk < nchunk0 * nchunk1) {
  1204. const int64_t ith0 = current_chunk % nchunk0;
  1205. const int64_t ith1 = current_chunk / nchunk0;
  1206. const int64_t ir0_start = dr0 * ith0;
  1207. const int64_t ir0_end = MIN(ir0_start + dr0, nr0);
  1208. const int64_t ir1_start = dr1 * ith1;
  1209. const int64_t ir1_end = MIN(ir1_start + dr1, nr1);
  1210. // dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols
  1211. int64_t num_rows_per_vec_dot = vec_dot_num_rows;
  1212. // these checks are needed to avoid crossing dim1 boundaries
  1213. // can be optimized, but the logic would become more complicated, so keeping it like this for simplicity
  1214. if ((nr0 % 2 != 0) || (ne11 % 2 != 0) || ((ir0_end - ir0_start) % 2 != 0) || ((ir1_end - ir1_start) % 2 != 0)) {
  1215. num_rows_per_vec_dot = 1;
  1216. }
  1217. ggml_compute_forward_mul_mat_one_chunk(params, dst, src0->type, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end);
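// if there are at least as many threads as chunks, every chunk was already claimed by the
// thread with the matching id, so there is nothing left to fetch from the shared counter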
  1218. if (nth >= nchunk0 * nchunk1) {
  1219. break;
  1220. }
  1221. current_chunk = atomic_fetch_add_explicit(&params->threadpool->current_chunk, 1, memory_order_relaxed);
  1222. }
  1223. }
  1224. // ggml_compute_forward_mul_mat_id
  1225. #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ids->ne[0]*ids->ne[1] + (i1)]
  1226. struct mmid_row_mapping {
  1227. int32_t i1;
  1228. int32_t i2;
  1229. };
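// matrix_rows records, for each expert, the (expert slot i1, token i2) pairs that selected it;
// MMID_MATRIX_ROW(row_id, i1) indexes the i1-th such pair for expert row_id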
  1230. static void ggml_compute_forward_mul_mat_id_one_chunk(
  1231. struct ggml_tensor * dst,
  1232. const struct ggml_tensor * src0,
  1233. const struct ggml_tensor * src1,
  1234. const struct ggml_tensor * ids,
  1235. const int64_t cur_a,
  1236. const int64_t ir0_start,
  1237. const int64_t ir0_end,
  1238. const int64_t ir1_start,
  1239. const int64_t ir1_end,
  1240. const char * src0_cur,
  1241. const struct mmid_row_mapping * matrix_rows,
  1242. const size_t row_size,
  1243. const bool src1_cont,
  1244. const void * wdata) {
  1245. GGML_TENSOR_BINARY_OP_LOCALS
  1246. const enum ggml_type type = src0->type;
  1247. ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot;
  1248. enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
  1249. const int64_t blck_0 = 16;
  1250. const int64_t blck_1 = 16;
  1251. float tmp[16];
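// the output is computed in blck_0 x blck_1 tiles; tmp collects the dot products for one
// dst row of a tile so they can be written out with a single memcpy below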
  1252. for (int64_t iir1 = ir1_start; iir1 < ir1_end; iir1 += blck_1) {
  1253. for (int64_t iir0 = ir0_start; iir0 < ir0_end; iir0 += blck_0) {
  1254. for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir1_end; ++ir1) {
  1255. const int64_t _i12 = ir1; // logical row index for this expert
  1256. struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, _i12);
1257. const int id = row_mapping.i1; // expert slot index (which of the n_expert_used slots)
1258. const int64_t i11 = id % ne11;
1259. const int64_t i12 = row_mapping.i2; // token index (second dim of src1)
1260. const int64_t i1 = id; // dst row within the token (expert slot)
1261. const int64_t i2 = i12; // dst token index
  1262. // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
  1263. // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
  1264. // the original src1 data pointer, so we should index using the indices directly
  1265. // TODO: this is a bit of a hack, we should probably have a better way to handle this
  1266. const char * src1_col = (const char *) wdata +
  1267. (src1_cont || src1->type != vec_dot_type
  1268. ? (i11 + i12*ne11)*row_size
  1269. : (i11*nb11 + i12*nb12));
  1270. float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2));
  1271. for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ++ir0) {
  1272. vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_cur + ir0*nb01, 0, src1_col, 0, 1);
  1273. }
  1274. memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir0_end) - iir0)*sizeof(float));
  1275. }
  1276. }
  1277. }
  1278. }
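// bump-allocate size bytes from the shared work buffer, aligning the returned pointer to align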
  1279. static void * incr_ptr_aligned(void ** p, size_t size, size_t align) {
  1280. void * ptr = *p;
  1281. ptr = (void *) GGML_PAD((uintptr_t) ptr, align);
  1282. *p = (void *) ((char *) ptr + size);
  1283. return ptr;
  1284. }
  1285. static void ggml_compute_forward_mul_mat_id(
  1286. const struct ggml_compute_params * params,
  1287. struct ggml_tensor * dst) {
  1288. const struct ggml_tensor * src0 = dst->src[0];
  1289. const struct ggml_tensor * src1 = dst->src[1];
  1290. const struct ggml_tensor * ids = dst->src[2];
  1291. GGML_TENSOR_BINARY_OP_LOCALS
  1292. const int ith = params->ith;
  1293. const int nth = params->nth;
  1294. const enum ggml_type type = src0->type;
  1295. const bool src1_cont = ggml_is_contiguous(src1);
  1296. enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
  1297. ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float;
  1298. // we don't support permuted src0 or src1
  1299. GGML_ASSERT(nb00 == ggml_type_size(type));
  1300. GGML_ASSERT(nb10 == ggml_type_size(src1->type));
  1301. // dst cannot be transposed or permuted
  1302. GGML_ASSERT(nb0 == sizeof(float));
  1303. GGML_ASSERT(nb0 <= nb1);
  1304. GGML_ASSERT(nb1 <= nb2);
  1305. GGML_ASSERT(nb2 <= nb3);
  1306. // row groups
  1307. const int n_ids = ids->ne[0]; // n_expert_used
  1308. const int n_as = ne02; // n_expert
  1309. void * wdata_cur = params->wdata;
  1310. if (src1->type != vec_dot_type) {
  1311. incr_ptr_aligned(&wdata_cur, ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));
  1312. }
  1313. int64_t * matrix_row_counts = // [n_as]
  1314. incr_ptr_aligned(&wdata_cur, n_as*sizeof(int64_t), sizeof(int64_t));
  1315. struct mmid_row_mapping * matrix_rows = // [n_as][ids->ne[0]*ids->ne[1]]
  1316. incr_ptr_aligned(&wdata_cur, n_as*ids->ne[0]*ids->ne[1]*sizeof(struct mmid_row_mapping), sizeof(int64_t));
  1317. char (*atomic_current_chunk)[CACHE_LINE_SIZE] = // [n_as]
  1318. incr_ptr_aligned(&wdata_cur, CACHE_LINE_SIZE * n_as, CACHE_LINE_SIZE);
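// note: this carve-out must stay in sync with the work-size estimate for GGML_OP_MUL_MAT_ID in ggml_graph_plan()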
  1319. GGML_ASSERT(params->wsize >= (size_t)((char *) wdata_cur - (char *) params->wdata));
  1320. if (src1->type != vec_dot_type) {
  1321. char * wdata = params->wdata;
  1322. const size_t nbw0 = ggml_type_size(vec_dot_type);
  1323. const size_t nbw1 = ggml_row_size(vec_dot_type, ne10);
  1324. const size_t nbw2 = nbw1*ne11;
  1325. const size_t nbw3 = nbw2*ne12;
  1326. assert(params->wsize >= ne13*nbw3);
  1327. GGML_ASSERT(src1->type == GGML_TYPE_F32);
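// same src1 -> vec_dot_type conversion as in ggml_compute_forward_mul_mat() above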
  1328. #if 0
  1329. for (int64_t i13 = 0; i13 < ne13; ++i13) {
  1330. for (int64_t i12 = ith; i12 < ne12; i12 += nth) {
  1331. for (int64_t i11 = 0; i11 < ne11; ++i11) {
  1332. from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
  1333. (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
  1334. ne10);
  1335. }
  1336. }
  1337. }
  1338. #else
  1339. for (int64_t i13 = 0; i13 < ne13; ++i13) {
  1340. for (int64_t i12 = 0; i12 < ne12; ++i12) {
  1341. for (int64_t i11 = 0; i11 < ne11; ++i11) {
  1342. size_t bs = ggml_blck_size(vec_dot_type);
  1343. int64_t ne10_block_start = (ith * ne10/bs) / nth;
  1344. int64_t ne10_block_end = ((ith + 1) * ne10/bs) / nth;
  1345. from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + ne10_block_start*bs*nb10),
  1346. (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1 + ne10_block_start*nbw0),
  1347. (ne10_block_end - ne10_block_start) * bs);
  1348. }
  1349. }
  1350. }
  1351. #endif
  1352. }
  1353. if (ith == 0) {
  1354. // initialize matrix_row_counts
  1355. memset(matrix_row_counts, 0, n_as*sizeof(int64_t));
  1356. // group rows by src0 matrix
  1357. for (int64_t iid1 = 0; iid1 < ids->ne[1]; ++iid1) {
  1358. for (int id = 0; id < n_ids; ++id) {
  1359. const int32_t i02 = *(const int32_t *) ((const char *) ids->data + iid1*ids->nb[1] + id*ids->nb[0]);
  1360. assert(i02 >= 0 && i02 < n_as);
  1361. MMID_MATRIX_ROW(i02, matrix_row_counts[i02]) = (struct mmid_row_mapping) {id, iid1};
  1362. matrix_row_counts[i02] += 1;
  1363. }
  1364. }
  1365. }
  1366. // reset current_chunk
  1367. for (int cur_a = ith; cur_a < n_as; cur_a += nth) {
  1368. atomic_int * current_chunk_ctr = (atomic_int *)(atomic_current_chunk + cur_a);
  1369. *current_chunk_ctr = nth;
  1370. }
  1371. ggml_barrier(params->threadpool);
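// past this barrier every thread can safely read matrix_row_counts, matrix_rows and the per-expert chunk counters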
  1372. for (int cur_a = 0; cur_a < n_as; ++cur_a) {
  1373. const int64_t cne1 = matrix_row_counts[cur_a];
  1374. if (cne1 == 0) {
  1375. continue;
  1376. }
  1377. const char * src0_cur = (const char *) src0->data + cur_a * nb02;
  1378. const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
  1379. const size_t row_size = ggml_row_size(vec_dot_type, ne10);
  1380. const int64_t nr0 = ne01;
  1381. const int64_t nr1 = cne1;
  1382. int chunk_size = 16;
  1383. if (nr0 == 1 || nr1 == 1) {
  1384. chunk_size = 64;
  1385. }
  1386. #if defined(__aarch64__)
  1387. // disable for ARM
  1388. const bool disable_chunking = true;
  1389. #else
  1390. // disable for NUMA
  1391. const bool disable_chunking = ggml_is_numa();
  1392. #endif // defined(__aarch64__)
  1393. int64_t nchunk0 = (nr0 + chunk_size - 1) / chunk_size;
  1394. int64_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size;
  1395. if (nchunk0 * nchunk1 < nth * 4 || disable_chunking) {
  1396. nchunk0 = nr0 > nr1 ? nth : 1;
  1397. nchunk1 = nr0 > nr1 ? 1 : nth;
  1398. }
  1399. const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0;
  1400. const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1;
  1401. int current_chunk = ith;
  1402. atomic_int * current_chunk_ctr = (atomic_int *)(atomic_current_chunk + cur_a);
  1403. while (current_chunk < nchunk0 * nchunk1) {
  1404. const int64_t ith0 = current_chunk % nchunk0;
  1405. const int64_t ith1 = current_chunk / nchunk0;
  1406. const int64_t ir0_start = dr0 * ith0;
  1407. const int64_t ir0_end = MIN(ir0_start + dr0, nr0);
  1408. const int64_t ir1_start = dr1 * ith1;
  1409. const int64_t ir1_end = MIN(ir1_start + dr1, nr1);
  1410. ggml_compute_forward_mul_mat_id_one_chunk(
  1411. dst, src0, src1, ids, cur_a,
  1412. ir0_start, ir0_end, ir1_start, ir1_end,
  1413. src0_cur, matrix_rows, row_size, src1_cont, wdata
  1414. );
  1415. if (nth >= nchunk0 * nchunk1) {
  1416. break;
  1417. }
  1418. current_chunk = atomic_fetch_add_explicit(current_chunk_ctr, 1, memory_order_relaxed);
  1419. }
  1420. }
  1421. }
  1422. /////////////////////////////////
  1423. static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
  1424. GGML_ASSERT(params);
  1425. if (tensor->op == GGML_OP_NONE || ggml_is_empty(tensor)) {
  1426. return;
  1427. }
  1428. // extra_buffer op?
  1429. if (ggml_cpu_extra_compute_forward(params, tensor)) {
  1430. return;
  1431. }
  1432. switch (tensor->op) {
  1433. case GGML_OP_DUP:
  1434. {
  1435. ggml_compute_forward_dup(params, tensor);
  1436. } break;
  1437. case GGML_OP_ADD:
  1438. {
  1439. ggml_compute_forward_add(params, tensor);
  1440. } break;
  1441. case GGML_OP_ADD_ID:
  1442. {
  1443. ggml_compute_forward_add_id(params, tensor);
  1444. } break;
  1445. case GGML_OP_ADD1:
  1446. {
  1447. ggml_compute_forward_add1(params, tensor);
  1448. } break;
  1449. case GGML_OP_ACC:
  1450. {
  1451. ggml_compute_forward_acc(params, tensor);
  1452. } break;
  1453. case GGML_OP_SUB:
  1454. {
  1455. ggml_compute_forward_sub(params, tensor);
  1456. } break;
  1457. case GGML_OP_MUL:
  1458. {
  1459. ggml_compute_forward_mul(params, tensor);
  1460. } break;
  1461. case GGML_OP_DIV:
  1462. {
  1463. ggml_compute_forward_div(params, tensor);
  1464. } break;
  1465. case GGML_OP_SQR:
  1466. {
  1467. ggml_compute_forward_sqr(params, tensor);
  1468. } break;
  1469. case GGML_OP_SQRT:
  1470. {
  1471. ggml_compute_forward_sqrt(params, tensor);
  1472. } break;
  1473. case GGML_OP_LOG:
  1474. {
  1475. ggml_compute_forward_log(params, tensor);
  1476. } break;
  1477. case GGML_OP_SIN:
  1478. {
  1479. ggml_compute_forward_sin(params, tensor);
  1480. } break;
  1481. case GGML_OP_COS:
  1482. {
  1483. ggml_compute_forward_cos(params, tensor);
  1484. } break;
  1485. case GGML_OP_SUM:
  1486. {
  1487. ggml_compute_forward_sum(params, tensor);
  1488. } break;
  1489. case GGML_OP_SUM_ROWS:
  1490. {
  1491. ggml_compute_forward_sum_rows(params, tensor);
  1492. } break;
  1493. case GGML_OP_MEAN:
  1494. {
  1495. ggml_compute_forward_mean(params, tensor);
  1496. } break;
  1497. case GGML_OP_ARGMAX:
  1498. {
  1499. ggml_compute_forward_argmax(params, tensor);
  1500. } break;
  1501. case GGML_OP_COUNT_EQUAL:
  1502. {
  1503. ggml_compute_forward_count_equal(params, tensor);
  1504. } break;
  1505. case GGML_OP_REPEAT:
  1506. {
  1507. ggml_compute_forward_repeat(params, tensor);
  1508. } break;
  1509. case GGML_OP_REPEAT_BACK:
  1510. {
  1511. ggml_compute_forward_repeat_back(params, tensor);
  1512. } break;
  1513. case GGML_OP_CONCAT:
  1514. {
  1515. ggml_compute_forward_concat(params, tensor);
  1516. } break;
  1517. case GGML_OP_SILU_BACK:
  1518. {
  1519. ggml_compute_forward_silu_back(params, tensor);
  1520. } break;
  1521. case GGML_OP_NORM:
  1522. {
  1523. ggml_compute_forward_norm(params, tensor);
  1524. } break;
  1525. case GGML_OP_RMS_NORM:
  1526. {
  1527. ggml_compute_forward_rms_norm(params, tensor);
  1528. } break;
  1529. case GGML_OP_RMS_NORM_BACK:
  1530. {
  1531. ggml_compute_forward_rms_norm_back(params, tensor);
  1532. } break;
  1533. case GGML_OP_GROUP_NORM:
  1534. {
  1535. ggml_compute_forward_group_norm(params, tensor);
  1536. } break;
  1537. case GGML_OP_L2_NORM:
  1538. {
  1539. ggml_compute_forward_l2_norm(params, tensor);
  1540. } break;
  1541. case GGML_OP_MUL_MAT:
  1542. {
  1543. ggml_compute_forward_mul_mat(params, tensor);
  1544. } break;
  1545. case GGML_OP_MUL_MAT_ID:
  1546. {
  1547. ggml_compute_forward_mul_mat_id(params, tensor);
  1548. } break;
  1549. case GGML_OP_OUT_PROD:
  1550. {
  1551. ggml_compute_forward_out_prod(params, tensor);
  1552. } break;
  1553. case GGML_OP_SCALE:
  1554. {
  1555. ggml_compute_forward_scale(params, tensor);
  1556. } break;
  1557. case GGML_OP_SET:
  1558. {
  1559. ggml_compute_forward_set(params, tensor);
  1560. } break;
  1561. case GGML_OP_CPY:
  1562. {
  1563. ggml_compute_forward_cpy(params, tensor);
  1564. } break;
  1565. case GGML_OP_CONT:
  1566. {
  1567. ggml_compute_forward_cont(params, tensor);
  1568. } break;
  1569. case GGML_OP_RESHAPE:
  1570. {
  1571. ggml_compute_forward_reshape(params, tensor);
  1572. } break;
  1573. case GGML_OP_VIEW:
  1574. {
  1575. ggml_compute_forward_view(params, tensor);
  1576. } break;
  1577. case GGML_OP_PERMUTE:
  1578. {
  1579. ggml_compute_forward_permute(params, tensor);
  1580. } break;
  1581. case GGML_OP_TRANSPOSE:
  1582. {
  1583. ggml_compute_forward_transpose(params, tensor);
  1584. } break;
  1585. case GGML_OP_GET_ROWS:
  1586. {
  1587. ggml_compute_forward_get_rows(params, tensor);
  1588. } break;
  1589. case GGML_OP_GET_ROWS_BACK:
  1590. {
  1591. ggml_compute_forward_get_rows_back(params, tensor);
  1592. } break;
  1593. case GGML_OP_SET_ROWS:
  1594. {
  1595. ggml_compute_forward_set_rows(params, tensor);
  1596. } break;
  1597. case GGML_OP_DIAG:
  1598. {
  1599. ggml_compute_forward_diag(params, tensor);
  1600. } break;
  1601. case GGML_OP_DIAG_MASK_INF:
  1602. {
  1603. ggml_compute_forward_diag_mask_inf(params, tensor);
  1604. } break;
  1605. case GGML_OP_DIAG_MASK_ZERO:
  1606. {
  1607. ggml_compute_forward_diag_mask_zero(params, tensor);
  1608. } break;
  1609. case GGML_OP_SOFT_MAX:
  1610. {
  1611. ggml_compute_forward_soft_max(params, tensor);
  1612. } break;
  1613. case GGML_OP_SOFT_MAX_BACK:
  1614. {
  1615. ggml_compute_forward_soft_max_ext_back(params, tensor);
  1616. } break;
  1617. case GGML_OP_ROPE:
  1618. {
  1619. ggml_compute_forward_rope(params, tensor);
  1620. } break;
  1621. case GGML_OP_ROPE_BACK:
  1622. {
  1623. ggml_compute_forward_rope_back(params, tensor);
  1624. } break;
  1625. case GGML_OP_CLAMP:
  1626. {
  1627. ggml_compute_forward_clamp(params, tensor);
  1628. } break;
  1629. case GGML_OP_CONV_TRANSPOSE_1D:
  1630. {
  1631. ggml_compute_forward_conv_transpose_1d(params, tensor);
  1632. } break;
  1633. case GGML_OP_IM2COL:
  1634. {
  1635. ggml_compute_forward_im2col(params, tensor);
  1636. } break;
  1637. case GGML_OP_IM2COL_BACK:
  1638. {
  1639. ggml_compute_forward_im2col_back_f32(params, tensor);
  1640. } break;
  1641. case GGML_OP_IM2COL_3D:
  1642. {
  1643. ggml_compute_forward_im2col_3d(params, tensor);
  1644. } break;
  1645. case GGML_OP_CONV_2D:
  1646. {
  1647. ggml_compute_forward_conv_2d(params, tensor);
  1648. } break;
  1649. case GGML_OP_CONV_3D:
  1650. {
  1651. ggml_compute_forward_conv_3d(params, tensor);
  1652. } break;
  1653. case GGML_OP_CONV_2D_DW:
  1654. {
  1655. ggml_compute_forward_conv_2d_dw(params, tensor);
  1656. } break;
  1657. case GGML_OP_CONV_TRANSPOSE_2D:
  1658. {
  1659. ggml_compute_forward_conv_transpose_2d(params, tensor);
  1660. } break;
  1661. case GGML_OP_POOL_1D:
  1662. {
  1663. ggml_compute_forward_pool_1d(params, tensor);
  1664. } break;
  1665. case GGML_OP_POOL_2D:
  1666. {
  1667. ggml_compute_forward_pool_2d(params, tensor);
  1668. } break;
  1669. case GGML_OP_POOL_2D_BACK:
  1670. {
  1671. ggml_compute_forward_pool_2d_back(params, tensor);
  1672. } break;
  1673. case GGML_OP_UPSCALE:
  1674. {
  1675. ggml_compute_forward_upscale(params, tensor);
  1676. } break;
  1677. case GGML_OP_PAD:
  1678. {
  1679. ggml_compute_forward_pad(params, tensor);
  1680. } break;
  1681. case GGML_OP_PAD_REFLECT_1D:
  1682. {
  1683. ggml_compute_forward_pad_reflect_1d(params, tensor);
  1684. } break;
  1685. case GGML_OP_ROLL:
  1686. {
  1687. ggml_compute_forward_roll(params, tensor);
  1688. } break;
  1689. case GGML_OP_ARANGE:
  1690. {
  1691. ggml_compute_forward_arange(params, tensor);
  1692. } break;
  1693. case GGML_OP_TIMESTEP_EMBEDDING:
  1694. {
  1695. ggml_compute_forward_timestep_embedding(params, tensor);
  1696. } break;
  1697. case GGML_OP_ARGSORT:
  1698. {
  1699. ggml_compute_forward_argsort(params, tensor);
  1700. } break;
  1701. case GGML_OP_LEAKY_RELU:
  1702. {
  1703. ggml_compute_forward_leaky_relu(params, tensor);
  1704. } break;
  1705. case GGML_OP_FLASH_ATTN_EXT:
  1706. {
  1707. ggml_compute_forward_flash_attn_ext(params, tensor);
  1708. } break;
  1709. case GGML_OP_FLASH_ATTN_BACK:
  1710. {
  1711. int32_t t = ggml_get_op_params_i32(tensor, 0);
  1712. GGML_ASSERT(t == 0 || t == 1);
  1713. bool masked = t != 0;
  1714. ggml_compute_forward_flash_attn_back(params, masked, tensor);
  1715. } break;
  1716. case GGML_OP_SSM_CONV:
  1717. {
  1718. ggml_compute_forward_ssm_conv(params, tensor);
  1719. } break;
  1720. case GGML_OP_SSM_SCAN:
  1721. {
  1722. ggml_compute_forward_ssm_scan(params, tensor);
  1723. } break;
  1724. case GGML_OP_WIN_PART:
  1725. {
  1726. ggml_compute_forward_win_part(params, tensor);
  1727. } break;
  1728. case GGML_OP_WIN_UNPART:
  1729. {
  1730. ggml_compute_forward_win_unpart(params, tensor);
  1731. } break;
  1732. case GGML_OP_UNARY:
  1733. {
  1734. ggml_compute_forward_unary(params, tensor);
  1735. } break;
  1736. case GGML_OP_GLU:
  1737. {
  1738. ggml_compute_forward_glu(params, tensor);
  1739. } break;
  1740. case GGML_OP_GET_REL_POS:
  1741. {
  1742. ggml_compute_forward_get_rel_pos(params, tensor);
  1743. } break;
  1744. case GGML_OP_ADD_REL_POS:
  1745. {
  1746. ggml_compute_forward_add_rel_pos(params, tensor);
  1747. } break;
  1748. case GGML_OP_RWKV_WKV6:
  1749. {
  1750. ggml_compute_forward_rwkv_wkv6(params, tensor);
  1751. } break;
  1752. case GGML_OP_GATED_LINEAR_ATTN:
  1753. {
  1754. ggml_compute_forward_gla(params, tensor);
  1755. } break;
  1756. case GGML_OP_RWKV_WKV7:
  1757. {
  1758. ggml_compute_forward_rwkv_wkv7(params, tensor);
  1759. } break;
  1760. case GGML_OP_MAP_CUSTOM1:
  1761. {
  1762. ggml_compute_forward_map_custom1(params, tensor);
  1763. }
  1764. break;
  1765. case GGML_OP_MAP_CUSTOM2:
  1766. {
  1767. ggml_compute_forward_map_custom2(params, tensor);
  1768. }
  1769. break;
  1770. case GGML_OP_MAP_CUSTOM3:
  1771. {
  1772. ggml_compute_forward_map_custom3(params, tensor);
  1773. }
  1774. break;
  1775. case GGML_OP_CUSTOM:
  1776. {
  1777. ggml_compute_forward_custom(params, tensor);
  1778. }
  1779. break;
  1780. case GGML_OP_CROSS_ENTROPY_LOSS:
  1781. {
  1782. ggml_compute_forward_cross_entropy_loss(params, tensor);
  1783. }
  1784. break;
  1785. case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
  1786. {
  1787. ggml_compute_forward_cross_entropy_loss_back(params, tensor);
  1788. }
  1789. break;
  1790. case GGML_OP_OPT_STEP_ADAMW:
  1791. {
  1792. ggml_compute_forward_opt_step_adamw(params, tensor);
  1793. }
  1794. break;
  1795. case GGML_OP_OPT_STEP_SGD:
  1796. {
  1797. ggml_compute_forward_opt_step_sgd(params, tensor);
  1798. }
  1799. break;
  1800. case GGML_OP_NONE:
  1801. {
  1802. // nop
  1803. } break;
  1804. case GGML_OP_COUNT:
  1805. {
  1806. GGML_ABORT("fatal error");
  1807. }
  1808. }
  1809. }
  1810. // Android's libc implementation "bionic" does not support setting affinity
  1811. #if defined(__gnu_linux__)
  1812. static void set_numa_thread_affinity(int thread_n) {
  1813. if (!ggml_is_numa()) {
  1814. return;
  1815. }
  1816. int node_num;
  1817. int rv;
  1818. size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus);
  1819. switch(g_state.numa.numa_strategy) {
  1820. case GGML_NUMA_STRATEGY_DISTRIBUTE:
  1821. // run thread on node_num thread_n / (threads per node)
  1822. node_num = thread_n % g_state.numa.n_nodes;
  1823. break;
  1824. case GGML_NUMA_STRATEGY_ISOLATE:
  1825. // run thread on current_node
  1826. node_num = g_state.numa.current_node;
  1827. break;
  1828. case GGML_NUMA_STRATEGY_NUMACTL:
  1829. // use the cpuset that numactl gave us
  1830. rv = pthread_setaffinity_np(pthread_self(), setsize, &g_state.numa.cpuset);
  1831. if (rv) {
  1832. fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",strerror(rv));
  1833. }
  1834. return;
  1835. default:
  1836. return;
  1837. }
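// pin this thread to every CPU that belongs to the selected NUMA node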
  1838. struct ggml_numa_node * node = &g_state.numa.nodes[node_num];
  1839. cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus);
  1840. CPU_ZERO_S(setsize, cpus);
  1841. for (size_t i = 0; i < node->n_cpus; ++i) {
  1842. CPU_SET_S(node->cpus[i], setsize, cpus);
  1843. }
  1844. rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
  1845. if (rv) {
  1846. fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv));
  1847. }
  1848. CPU_FREE(cpus);
  1849. }
  1850. static void clear_numa_thread_affinity(void) {
  1851. if (!ggml_is_numa()) {
  1852. return;
  1853. }
  1854. size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus);
  1855. cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus);
  1856. CPU_ZERO_S(setsize, cpus);
  1857. for (unsigned i = 0; i < g_state.numa.total_cpus; ++i) {
  1858. CPU_SET_S(i, setsize, cpus);
  1859. }
  1860. int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
  1861. if (rv) {
  1862. fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv));
  1863. }
  1864. CPU_FREE(cpus);
  1865. }
  1866. #else
  1867. // TODO: Windows etc.
  1868. // (the linux implementation may also work on BSD, someone should test)
  1869. static void set_numa_thread_affinity(int thread_n) { UNUSED(thread_n); }
  1870. static void clear_numa_thread_affinity(void) {}
  1871. #endif
  1872. static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
  1873. int n_tasks = 0;
  1874. if (ggml_is_empty(node)) {
  1875. // no need to multi-thread a no-op
  1876. n_tasks = 1;
  1877. return n_tasks;
  1878. }
  1879. switch (node->op) {
  1880. case GGML_OP_CPY:
  1881. case GGML_OP_DUP:
  1882. case GGML_OP_CONT:
  1883. case GGML_OP_ADD:
  1884. case GGML_OP_ADD_ID:
  1885. case GGML_OP_ADD1:
  1886. case GGML_OP_ACC:
  1887. {
  1888. n_tasks = n_threads;
  1889. } break;
  1890. case GGML_OP_SUB:
  1891. case GGML_OP_SQR:
  1892. case GGML_OP_SQRT:
  1893. case GGML_OP_LOG:
  1894. case GGML_OP_SIN:
  1895. case GGML_OP_COS:
  1896. case GGML_OP_SUM:
  1897. case GGML_OP_SUM_ROWS:
  1898. case GGML_OP_MEAN:
  1899. case GGML_OP_ARGMAX:
  1900. {
  1901. n_tasks = 1;
  1902. } break;
  1903. case GGML_OP_COUNT_EQUAL:
  1904. {
  1905. n_tasks = n_threads;
  1906. } break;
  1907. case GGML_OP_REPEAT:
  1908. case GGML_OP_REPEAT_BACK:
  1909. case GGML_OP_LEAKY_RELU:
  1910. {
  1911. n_tasks = 1;
  1912. } break;
  1913. case GGML_OP_UNARY:
  1914. switch (ggml_get_unary_op(node)) {
  1915. case GGML_UNARY_OP_ABS:
  1916. case GGML_UNARY_OP_SGN:
  1917. case GGML_UNARY_OP_NEG:
  1918. case GGML_UNARY_OP_STEP:
  1919. case GGML_UNARY_OP_TANH:
  1920. case GGML_UNARY_OP_ELU:
  1921. case GGML_UNARY_OP_RELU:
  1922. case GGML_UNARY_OP_SIGMOID:
  1923. case GGML_UNARY_OP_HARDSWISH:
  1924. case GGML_UNARY_OP_HARDSIGMOID:
  1925. case GGML_UNARY_OP_EXP:
  1926. {
  1927. n_tasks = 1;
  1928. } break;
  1929. case GGML_UNARY_OP_GELU:
  1930. case GGML_UNARY_OP_GELU_ERF:
  1931. case GGML_UNARY_OP_GELU_QUICK:
  1932. case GGML_UNARY_OP_SILU:
  1933. {
  1934. n_tasks = n_threads;
  1935. } break;
  1936. default:
  1937. GGML_ABORT("fatal error");
  1938. }
  1939. break;
  1940. case GGML_OP_GLU:
  1941. switch (ggml_get_glu_op(node)) {
  1942. case GGML_GLU_OP_REGLU:
  1943. case GGML_GLU_OP_GEGLU:
  1944. case GGML_GLU_OP_SWIGLU:
  1945. case GGML_GLU_OP_SWIGLU_OAI:
  1946. case GGML_GLU_OP_GEGLU_ERF:
  1947. case GGML_GLU_OP_GEGLU_QUICK:
  1948. {
  1949. n_tasks = n_threads;
  1950. } break;
  1951. default:
  1952. GGML_ABORT("fatal error");
  1953. }
  1954. break;
  1955. case GGML_OP_SILU_BACK:
  1956. case GGML_OP_MUL:
  1957. case GGML_OP_DIV:
  1958. case GGML_OP_NORM:
  1959. case GGML_OP_RMS_NORM:
  1960. case GGML_OP_RMS_NORM_BACK:
  1961. case GGML_OP_L2_NORM:
  1962. case GGML_OP_GROUP_NORM:
  1963. case GGML_OP_CONCAT:
  1964. case GGML_OP_MUL_MAT:
  1965. case GGML_OP_MUL_MAT_ID:
  1966. case GGML_OP_OUT_PROD:
  1967. {
  1968. n_tasks = n_threads;
  1969. } break;
  1970. case GGML_OP_GET_ROWS:
  1971. case GGML_OP_SET_ROWS:
  1972. {
  1973. // FIXME: get_rows can use additional threads, but the cost of launching additional threads
  1974. // decreases performance with GPU offloading
  1975. //n_tasks = n_threads;
  1976. n_tasks = 1;
  1977. } break;
  1978. case GGML_OP_SCALE:
  1979. case GGML_OP_SET:
  1980. case GGML_OP_RESHAPE:
  1981. case GGML_OP_VIEW:
  1982. case GGML_OP_PERMUTE:
  1983. case GGML_OP_TRANSPOSE:
  1984. case GGML_OP_GET_ROWS_BACK:
  1985. case GGML_OP_DIAG:
  1986. {
  1987. n_tasks = 1;
  1988. } break;
  1989. case GGML_OP_DIAG_MASK_ZERO:
  1990. case GGML_OP_DIAG_MASK_INF:
  1991. case GGML_OP_SOFT_MAX_BACK:
  1992. case GGML_OP_ROPE:
  1993. case GGML_OP_ROPE_BACK:
  1994. case GGML_OP_ADD_REL_POS:
  1995. {
  1996. n_tasks = n_threads;
  1997. } break;
  1998. case GGML_OP_CLAMP:
  1999. {
  2000. n_tasks = 1; //TODO
  2001. } break;
  2002. case GGML_OP_SOFT_MAX:
  2003. {
  2004. n_tasks = MIN(n_threads, ggml_nrows(node->src[0]));
  2005. } break;
  2006. case GGML_OP_IM2COL:
  2007. case GGML_OP_IM2COL_BACK:
  2008. case GGML_OP_IM2COL_3D:
  2009. case GGML_OP_CONV_2D:
  2010. case GGML_OP_CONV_3D:
  2011. case GGML_OP_CONV_2D_DW:
  2012. case GGML_OP_CONV_TRANSPOSE_1D:
  2013. case GGML_OP_CONV_TRANSPOSE_2D:
  2014. {
  2015. n_tasks = n_threads;
  2016. } break;
  2017. case GGML_OP_POOL_1D:
  2018. case GGML_OP_POOL_2D:
  2019. case GGML_OP_POOL_2D_BACK:
  2020. {
  2021. n_tasks = 1;
  2022. } break;
  2023. case GGML_OP_UPSCALE:
  2024. case GGML_OP_PAD:
  2025. case GGML_OP_PAD_REFLECT_1D:
  2026. case GGML_OP_ROLL:
  2027. case GGML_OP_ARANGE:
  2028. case GGML_OP_TIMESTEP_EMBEDDING:
  2029. case GGML_OP_ARGSORT:
  2030. case GGML_OP_FLASH_ATTN_EXT:
  2031. case GGML_OP_FLASH_ATTN_BACK:
  2032. case GGML_OP_SSM_CONV:
  2033. case GGML_OP_SSM_SCAN:
  2034. case GGML_OP_RWKV_WKV6:
  2035. case GGML_OP_GATED_LINEAR_ATTN:
  2036. case GGML_OP_RWKV_WKV7:
  2037. {
  2038. n_tasks = n_threads;
  2039. } break;
  2040. case GGML_OP_WIN_PART:
  2041. case GGML_OP_WIN_UNPART:
  2042. case GGML_OP_GET_REL_POS:
  2043. {
  2044. n_tasks = 1;
  2045. } break;
  2046. case GGML_OP_MAP_CUSTOM1:
  2047. {
  2048. struct ggml_map_custom1_op_params p;
  2049. memcpy(&p, node->op_params, sizeof(p));
  2050. if (p.n_tasks == GGML_N_TASKS_MAX) {
  2051. n_tasks = n_threads;
  2052. } else {
  2053. n_tasks = MIN(p.n_tasks, n_threads);
  2054. }
  2055. } break;
  2056. case GGML_OP_MAP_CUSTOM2:
  2057. {
  2058. struct ggml_map_custom2_op_params p;
  2059. memcpy(&p, node->op_params, sizeof(p));
  2060. if (p.n_tasks == GGML_N_TASKS_MAX) {
  2061. n_tasks = n_threads;
  2062. } else {
  2063. n_tasks = MIN(p.n_tasks, n_threads);
  2064. }
  2065. } break;
  2066. case GGML_OP_MAP_CUSTOM3:
  2067. {
  2068. struct ggml_map_custom3_op_params p;
  2069. memcpy(&p, node->op_params, sizeof(p));
  2070. if (p.n_tasks == GGML_N_TASKS_MAX) {
  2071. n_tasks = n_threads;
  2072. } else {
  2073. n_tasks = MIN(p.n_tasks, n_threads);
  2074. }
  2075. } break;
  2076. case GGML_OP_CUSTOM:
  2077. {
  2078. struct ggml_custom_op_params p;
  2079. memcpy(&p, node->op_params, sizeof(p));
  2080. if (p.n_tasks == GGML_N_TASKS_MAX) {
  2081. n_tasks = n_threads;
  2082. } else {
  2083. n_tasks = MIN(p.n_tasks, n_threads);
  2084. }
  2085. } break;
  2086. case GGML_OP_CROSS_ENTROPY_LOSS:
  2087. case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
  2088. case GGML_OP_OPT_STEP_ADAMW:
  2089. case GGML_OP_OPT_STEP_SGD:
  2090. {
  2091. n_tasks = n_threads;
  2092. } break;
  2093. case GGML_OP_NONE:
  2094. {
  2095. n_tasks = 1;
  2096. } break;
  2097. case GGML_OP_COUNT:
  2098. {
  2099. GGML_ABORT("fatal error");
  2100. }
  2101. default:
  2102. {
  2103. fprintf(stderr, "%s: op not implemented: ", __func__);
  2104. if (node->op < GGML_OP_COUNT) {
  2105. fprintf(stderr, "%s\n", ggml_op_name(node->op));
  2106. } else {
  2107. fprintf(stderr, "%d\n", node->op);
  2108. }
  2109. GGML_ABORT("fatal error");
  2110. }
  2111. }
  2112. assert(n_tasks > 0);
  2113. return n_tasks;
  2114. }
  2115. static thread_ret_t ggml_graph_compute_secondary_thread(void* data);
  2116. #if defined(_WIN32)
  2117. #include "windows.h"
  2118. // TODO: support > 64 CPUs
  2119. static bool ggml_thread_apply_affinity(bool * mask) {
  2120. HANDLE h = GetCurrentThread();
  2121. uint64_t bitmask = 0ULL;
  2122. assert(GGML_MAX_N_THREADS >= 64);
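// pack the first 64 entries of the boolean mask into the 64-bit bitmask expected by SetThreadAffinityMask()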
  2123. for (int32_t i = 0; i < 8; i++) {
  2124. int32_t idx = i * 8;
  2125. uint8_t val = 0;
  2126. val |= mask[idx + 0] << 0;
  2127. val |= mask[idx + 1] << 1;
  2128. val |= mask[idx + 2] << 2;
  2129. val |= mask[idx + 3] << 3;
  2130. val |= mask[idx + 4] << 4;
  2131. val |= mask[idx + 5] << 5;
  2132. val |= mask[idx + 6] << 6;
  2133. val |= mask[idx + 7] << 7;
  2134. bitmask |= (uint64_t)val << idx;
  2135. }
  2136. for (int32_t i = 64; i < GGML_MAX_N_THREADS; i++) {
  2137. if (mask[i]) {
  2138. fprintf(stderr, "warn: setting thread-affinity for > 64 CPUs isn't supported on windows!\n");
  2139. break;
  2140. }
  2141. }
  2142. DWORD_PTR m = (DWORD_PTR)bitmask;
  2143. m = SetThreadAffinityMask(h, m);
  2144. return m != 0;
  2145. }
  2146. static bool ggml_thread_apply_priority(int32_t prio) {
  2147. // Note that on Windows the Process Priority Class must be updated in order to set Thread priority.
2148. // This is left up to the application.
  2149. DWORD p = THREAD_PRIORITY_NORMAL;
  2150. switch (prio) {
  2151. case GGML_SCHED_PRIO_LOW: p = THREAD_PRIORITY_BELOW_NORMAL; break;
  2152. case GGML_SCHED_PRIO_NORMAL: p = THREAD_PRIORITY_NORMAL; break;
  2153. case GGML_SCHED_PRIO_MEDIUM: p = THREAD_PRIORITY_ABOVE_NORMAL; break;
  2154. case GGML_SCHED_PRIO_HIGH: p = THREAD_PRIORITY_HIGHEST; break;
  2155. case GGML_SCHED_PRIO_REALTIME: p = THREAD_PRIORITY_TIME_CRITICAL; break;
  2156. }
  2157. if (prio != GGML_SCHED_PRIO_LOW) {
  2158. // Tell Windows that this thread should not be throttled (needs its own CPU core).
2159. // Newer Windows 11 versions aggressively park (offline) CPU cores and often place
  2160. // all our threads onto the first 4 cores which results in terrible performance with
  2161. // n_threads > 4
  2162. #if _WIN32_WINNT >= 0x0602
  2163. THREAD_POWER_THROTTLING_STATE t;
  2164. ZeroMemory(&t, sizeof(t));
  2165. t.Version = THREAD_POWER_THROTTLING_CURRENT_VERSION;
  2166. t.ControlMask = THREAD_POWER_THROTTLING_EXECUTION_SPEED;
  2167. t.StateMask = 0;
  2168. if (!SetThreadInformation(GetCurrentThread(), ThreadPowerThrottling, &t, sizeof(t))) {
  2169. GGML_LOG_DEBUG("failed to disable thread power throttling %d : (%d)\n", prio, (int) GetLastError());
  2170. return false;
  2171. }
  2172. #endif
  2173. }
  2174. if (prio == GGML_SCHED_PRIO_NORMAL) {
  2175. // Keep inherited policy/priority
  2176. return true;
  2177. }
  2178. if (!SetThreadPriority(GetCurrentThread(), p)) {
  2179. fprintf(stderr, "warn: failed to set thread priority %d : (%d)\n", prio, (int) GetLastError());
  2180. return false;
  2181. }
  2182. return true;
  2183. }
  2184. #elif defined(__APPLE__)
  2185. #include <sys/types.h>
  2186. #include <sys/resource.h>
  2187. static bool ggml_thread_apply_affinity(const bool * mask) {
  2188. // Not supported on Apple platforms
  2189. UNUSED(mask);
  2190. return true;
  2191. }
  2192. static bool ggml_thread_apply_priority(int32_t prio) {
  2193. struct sched_param p;
  2194. int32_t policy = SCHED_OTHER;
  2195. switch (prio) {
  2196. // TODO: there seems to be no way to set lower prio on Apple platforms
  2197. case GGML_SCHED_PRIO_LOW: policy = SCHED_OTHER; p.sched_priority = 0; break;
  2198. case GGML_SCHED_PRIO_NORMAL: policy = SCHED_OTHER; p.sched_priority = 0; break;
  2199. case GGML_SCHED_PRIO_MEDIUM: policy = SCHED_FIFO; p.sched_priority = 40; break;
  2200. case GGML_SCHED_PRIO_HIGH: policy = SCHED_FIFO; p.sched_priority = 80; break;
  2201. case GGML_SCHED_PRIO_REALTIME: policy = SCHED_FIFO; p.sched_priority = 90; break;
  2202. }
  2203. if (prio == GGML_SCHED_PRIO_NORMAL) {
  2204. // Keep inherited policy/priority
  2205. return true;
  2206. }
  2207. int32_t err = pthread_setschedparam(pthread_self(), policy, &p);
  2208. if (err != 0) {
  2209. fprintf(stderr, "warn: failed to set thread priority %d : %s (%d)\n", prio, strerror(err), err);
  2210. return false;
  2211. }
  2212. return true;
  2213. }
  2214. #elif defined(__gnu_linux__)
  2215. // TODO: this may not work on BSD, to be verified
  2216. static bool ggml_thread_apply_affinity(const bool * mask) {
  2217. cpu_set_t cpuset;
  2218. int err;
  2219. CPU_ZERO(&cpuset);
  2220. for (uint32_t i = 0; i < GGML_MAX_N_THREADS; i++) {
  2221. if (mask[i]) {
  2222. GGML_PRINT_DEBUG("Thread %lx: adding %d to cpuset\n", pthread_self(), i);
  2223. CPU_SET(i, &cpuset);
  2224. }
  2225. }
  2226. #ifdef __ANDROID__
  2227. err = sched_setaffinity(0, sizeof(cpuset), &cpuset);
  2228. if (err < 0) {
  2229. err = errno;
  2230. }
  2231. #else
  2232. err = pthread_setaffinity_np(pthread_self(), sizeof(cpuset), &cpuset);
  2233. #endif
  2234. if (err != 0) {
  2235. fprintf(stderr, "warn: failed to set affinity mask 0x%llx : %s (%d)\n", (unsigned long long)mask, strerror(err), err);
  2236. return false;
  2237. }
  2238. return true;
  2239. }
  2240. static bool ggml_thread_apply_priority(int32_t prio) {
  2241. struct sched_param p;
  2242. int32_t policy = SCHED_OTHER;
  2243. switch (prio) {
  2244. case GGML_SCHED_PRIO_LOW: policy = SCHED_BATCH; p.sched_priority = 0; break;
  2245. case GGML_SCHED_PRIO_NORMAL: policy = SCHED_OTHER; p.sched_priority = 0; break;
  2246. case GGML_SCHED_PRIO_MEDIUM: policy = SCHED_FIFO; p.sched_priority = 40; break;
  2247. case GGML_SCHED_PRIO_HIGH: policy = SCHED_FIFO; p.sched_priority = 80; break;
  2248. case GGML_SCHED_PRIO_REALTIME: policy = SCHED_FIFO; p.sched_priority = 90; break;
  2249. }
  2250. if (prio == GGML_SCHED_PRIO_NORMAL) {
  2251. // Keep inherited policy/priority
  2252. return true;
  2253. }
  2254. int32_t err = pthread_setschedparam(pthread_self(), policy, &p);
  2255. if (err != 0) {
  2256. fprintf(stderr, "warn: failed to set thread priority %d : %s (%d)\n", prio, strerror(err), err);
  2257. return false;
  2258. }
  2259. return true;
  2260. }
  2261. #else // unsupported platforms
  2262. static bool ggml_thread_apply_affinity(const bool * mask) {
  2263. UNUSED(mask);
  2264. return true;
  2265. }
  2266. static bool ggml_thread_apply_priority(int32_t prio) {
  2267. UNUSED(prio);
  2268. return true;
  2269. }
  2270. #endif
  2271. static bool ggml_thread_cpumask_is_valid(const bool * mask) {
  2272. for (int i = 0; i < GGML_MAX_N_THREADS; i++) {
  2273. if (mask[i]) { return true; }
  2274. }
  2275. return false;
  2276. }
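// in strict mode hand out a single CPU from the global mask per call (round-robin via *iter),
// otherwise give every thread the full global mask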
  2277. static void ggml_thread_cpumask_next(const bool * global_mask, bool * local_mask, bool strict, int32_t* iter) {
  2278. if (!strict) {
  2279. memcpy(local_mask, global_mask, GGML_MAX_N_THREADS);
  2280. return;
  2281. } else {
  2282. memset(local_mask, 0, GGML_MAX_N_THREADS);
  2283. int32_t base_idx = *iter;
  2284. for (int32_t i = 0; i < GGML_MAX_N_THREADS; i++) {
  2285. int32_t idx = base_idx + i;
  2286. if (idx >= GGML_MAX_N_THREADS) {
  2287. // Just a cheaper modulo
  2288. idx -= GGML_MAX_N_THREADS;
  2289. }
  2290. if (global_mask[idx]) {
  2291. local_mask[idx] = 1;
  2292. *iter = idx + 1;
  2293. return;
  2294. }
  2295. }
  2296. }
  2297. }
  2298. void ggml_threadpool_free(struct ggml_threadpool* threadpool) {
  2299. if (!threadpool) return;
  2300. const int n_threads = threadpool->n_threads_max;
  2301. #ifndef GGML_USE_OPENMP
  2302. struct ggml_compute_state* workers = threadpool->workers;
  2303. ggml_mutex_lock(&threadpool->mutex);
  2304. threadpool->stop = true;
  2305. threadpool->pause = false;
  2306. ggml_cond_broadcast(&threadpool->cond);
  2307. ggml_mutex_unlock(&threadpool->mutex);
  2308. for (int j = 1; j < n_threads; j++) {
  2309. int32_t rc = ggml_thread_join(workers[j].thrd, NULL);
  2310. GGML_ASSERT(rc == GGML_EXIT_SUCCESS || rc == GGML_EXIT_ABORTED);
  2311. UNUSED(rc);
  2312. }
  2313. ggml_mutex_destroy(&threadpool->mutex);
  2314. ggml_cond_destroy(&threadpool->cond);
  2315. #endif // GGML_USE_OPENMP
  2316. const size_t workers_size = sizeof(struct ggml_compute_state) * n_threads;
  2317. ggml_aligned_free(threadpool->workers, workers_size);
  2318. ggml_aligned_free(threadpool, sizeof(struct ggml_threadpool));
  2319. }
  2320. #ifndef GGML_USE_OPENMP
  2321. // pause/resume must be called under mutex
  2322. static void ggml_threadpool_pause_locked(struct ggml_threadpool * threadpool) {
  2323. GGML_PRINT_DEBUG("Pausing threadpool\n");
  2324. threadpool->pause = true;
  2325. ggml_cond_broadcast(&threadpool->cond);
  2326. }
  2327. static void ggml_threadpool_resume_locked(struct ggml_threadpool * threadpool) {
  2328. GGML_PRINT_DEBUG("Resuming threadpool\n");
  2329. threadpool->pause = false;
  2330. ggml_cond_broadcast(&threadpool->cond);
  2331. }
  2332. #endif
  2333. void ggml_threadpool_pause(struct ggml_threadpool * threadpool) {
  2334. #ifndef GGML_USE_OPENMP
  2335. ggml_mutex_lock(&threadpool->mutex);
  2336. if (!threadpool->pause) {
  2337. ggml_threadpool_pause_locked(threadpool);
  2338. }
  2339. ggml_mutex_unlock(&threadpool->mutex);
  2340. #else
  2341. UNUSED(threadpool);
  2342. #endif
  2343. }
  2344. void ggml_threadpool_resume(struct ggml_threadpool * threadpool) {
  2345. #ifndef GGML_USE_OPENMP
  2346. ggml_mutex_lock(&threadpool->mutex);
  2347. if (threadpool->pause) {
  2348. ggml_threadpool_resume_locked(threadpool);
  2349. }
  2350. ggml_mutex_unlock(&threadpool->mutex);
  2351. #else
  2352. UNUSED(threadpool);
  2353. #endif
  2354. }
  2355. struct ggml_cplan ggml_graph_plan(
  2356. const struct ggml_cgraph * cgraph,
  2357. int n_threads,
  2358. struct ggml_threadpool * threadpool) {
  2359. if (threadpool == NULL) {
  2360. //GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %d\n", n_threads);
  2361. }
  2362. if (n_threads <= 0) {
  2363. n_threads = threadpool ? threadpool->n_threads_max : GGML_DEFAULT_N_THREADS;
  2364. }
  2365. size_t work_size = 0;
  2366. struct ggml_cplan cplan;
  2367. memset(&cplan, 0, sizeof(struct ggml_cplan));
  2368. int max_tasks = 1;
  2369. // thread scheduling for the different operations + work buffer size estimation
  2370. for (int i = 0; i < cgraph->n_nodes; i++) {
  2371. struct ggml_tensor * node = cgraph->nodes[i];
  2372. const int n_tasks = ggml_get_n_tasks(node, n_threads);
  2373. max_tasks = MAX(max_tasks, n_tasks);
  2374. size_t cur = 0;
  2375. if (!ggml_cpu_extra_work_size(n_threads, node, &cur)) {
  2376. switch (node->op) {
  2377. case GGML_OP_CPY:
  2378. case GGML_OP_DUP:
  2379. {
  2380. if (ggml_is_quantized(node->type) ||
  2381. // F16 -> BF16 and BF16 -> F16 copies go through intermediate F32
  2382. (node->src[0]->type == GGML_TYPE_F16 && node->src[1] && node->src[1]->type == GGML_TYPE_BF16) ||
  2383. (node->src[0]->type == GGML_TYPE_BF16 && node->src[1] && node->src[1]->type == GGML_TYPE_F16) ||
  2384. // conversion between F32 and I32
  2385. (node->src[0]->type == GGML_TYPE_F32 && node->src[1] && node->src[1]->type == GGML_TYPE_I32) ||
  2386. (node->src[0]->type == GGML_TYPE_I32 && node->src[1] && node->src[1]->type == GGML_TYPE_F32)) {
  2387. cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
  2388. }
  2389. } break;
  2390. case GGML_OP_ADD:
  2391. case GGML_OP_ADD_ID:
  2392. case GGML_OP_ADD1:
  2393. {
  2394. if (ggml_is_quantized(node->src[0]->type)) {
  2395. cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
  2396. }
  2397. } break;
  2398. case GGML_OP_ACC:
  2399. {
  2400. if (ggml_is_quantized(node->src[0]->type)) {
  2401. cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks;
  2402. }
  2403. } break;
  2404. case GGML_OP_COUNT_EQUAL:
  2405. {
  2406. cur = ggml_type_size(node->type)*n_tasks;
  2407. } break;
  2408. case GGML_OP_MUL_MAT:
  2409. {
  2410. const enum ggml_type vec_dot_type = type_traits_cpu[node->src[0]->type].vec_dot_type;
  2411. if (node->src[1]->type != vec_dot_type) {
  2412. cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
  2413. }
  2414. } break;
  2415. case GGML_OP_MUL_MAT_ID:
  2416. {
  2417. cur = 0;
  2418. const struct ggml_tensor * src0 = node->src[0];
  2419. const struct ggml_tensor * src1 = node->src[1];
  2420. const struct ggml_tensor * ids = node->src[2];
  2421. const enum ggml_type vec_dot_type = type_traits_cpu[src0->type].vec_dot_type;
  2422. const int n_as = src0->ne[2];
  2423. // src1
  2424. if (src1->type != vec_dot_type) {
  2425. cur += ggml_row_size(vec_dot_type, ggml_nelements(src1)) + sizeof(int64_t);
  2426. }
  2427. // matrix_row_counts
  2428. cur += n_as * sizeof(int64_t) + sizeof(int64_t);
  2429. // matrix_rows
  2430. cur += n_as*ids->ne[0]*ids->ne[1]*sizeof(struct mmid_row_mapping) + sizeof(int64_t);
  2431. // atomic_current_chunk
  2432. cur += CACHE_LINE_SIZE*n_as + CACHE_LINE_SIZE;
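// the extra sizeof(int64_t) / CACHE_LINE_SIZE terms leave room for the alignment padding
// added by incr_ptr_aligned() in ggml_compute_forward_mul_mat_id()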
  2433. } break;
  2434. case GGML_OP_OUT_PROD:
  2435. {
  2436. if (ggml_is_quantized(node->src[0]->type)) {
  2437. cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
  2438. }
  2439. } break;
  2440. case GGML_OP_SOFT_MAX:
  2441. case GGML_OP_ROPE:
  2442. case GGML_OP_ROPE_BACK:
  2443. {
  2444. cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
  2445. } break;
  2446. case GGML_OP_CONV_TRANSPOSE_1D:
  2447. {
  2448. GGML_ASSERT(node->src[0]->ne[3] == 1);
  2449. GGML_ASSERT(node->src[1]->ne[2] == 1);
  2450. GGML_ASSERT(node->src[1]->ne[3] == 1);
  2451. const int64_t ne00 = node->src[0]->ne[0]; // K
  2452. const int64_t ne01 = node->src[0]->ne[1]; // Cout
  2453. const int64_t ne02 = node->src[0]->ne[2]; // Cin
  2454. const int64_t ne10 = node->src[1]->ne[0]; // L
  2455. const int64_t ne11 = node->src[1]->ne[1]; // Cin
  2456. if ((node->src[0]->type == GGML_TYPE_F16 ||
  2457. node->src[0]->type == GGML_TYPE_BF16) &&
  2458. node->src[1]->type == GGML_TYPE_F32) {
  2459. cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02;
  2460. cur += sizeof(ggml_fp16_t)*ne10*ne11;
  2461. } else if (node->src[0]->type == GGML_TYPE_F32 &&
  2462. node->src[1]->type == GGML_TYPE_F32) {
  2463. cur += sizeof(float)*ne00*ne01*ne02;
  2464. cur += sizeof(float)*ne10*ne11;
  2465. } else {
  2466. GGML_ABORT("fatal error");
  2467. }
  2468. } break;
  2469. case GGML_OP_CONV_2D:
  2470. case GGML_OP_CONV_3D:
  2471. {
  2472. cur = GGML_IM2COL_WORK_SIZE;
  2473. } break;
  2474. case GGML_OP_CONV_TRANSPOSE_2D:
  2475. {
  2476. const int64_t ne00 = node->src[0]->ne[0]; // W
  2477. const int64_t ne01 = node->src[0]->ne[1]; // H
  2478. const int64_t ne02 = node->src[0]->ne[2]; // Channels Out
  2479. const int64_t ne03 = node->src[0]->ne[3]; // Channels In
  2480. const int64_t ne10 = node->src[1]->ne[0]; // W
  2481. const int64_t ne11 = node->src[1]->ne[1]; // H
  2482. const int64_t ne12 = node->src[1]->ne[2]; // Channels In
  2483. cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02*ne03;
  2484. cur += sizeof(ggml_fp16_t)*ne10*ne11*ne12;
  2485. } break;
  2486. case GGML_OP_FLASH_ATTN_EXT:
  2487. {
  2488. const int64_t ne10 = node->src[1]->ne[0]; // DK
  2489. const int64_t ne20 = node->src[2]->ne[0]; // DV
  2490. cur = sizeof(float)*(1*ne10 + 2*ne20)*n_tasks; // 1x head size K + 2x head size V (per thread)
  2491. } break;
  2492. case GGML_OP_FLASH_ATTN_BACK:
  2493. {
  2494. const int64_t D = node->src[0]->ne[0];
  2495. const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
  2496. const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back
  2497. if (node->src[1]->type == GGML_TYPE_F32) {
  2498. cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
  2499. cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
  2500. } else if (node->src[1]->type == GGML_TYPE_F16) {
  2501. cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
  2502. cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
  2503. } else if (node->src[1]->type == GGML_TYPE_BF16) {
  2504. cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
  2505. cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
  2506. }
  2507. } break;
  2508. case GGML_OP_CROSS_ENTROPY_LOSS:
  2509. {
  2510. cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks);
  2511. } break;
  2512. case GGML_OP_COUNT:
  2513. {
  2514. GGML_ABORT("fatal error");
  2515. }
  2516. default:
  2517. break;
  2518. }
  2519. }
  2520. work_size = MAX(work_size, cur);
  2521. }
  2522. if (work_size > 0) {
  2523. work_size += CACHE_LINE_SIZE*(n_threads);
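// extra headroom so that each thread's scratch area can be padded out to its own cache line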
  2524. }
  2525. cplan.threadpool = threadpool;
  2526. cplan.n_threads = MIN(max_tasks, n_threads);
  2527. cplan.work_size = work_size;
  2528. cplan.work_data = NULL;
  2529. return cplan;
  2530. }
  2531. static thread_ret_t ggml_graph_compute_thread(void * data) {
  2532. struct ggml_compute_state * state = (struct ggml_compute_state *) data;
  2533. struct ggml_threadpool * tp = state->threadpool;
  2534. const struct ggml_cgraph * cgraph = tp->cgraph;
  2535. const struct ggml_cplan * cplan = tp->cplan;
  2536. set_numa_thread_affinity(state->ith);
  2537. struct ggml_compute_params params = {
  2538. /*.ith =*/ state->ith,
  2539. /*.nth =*/ atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed),
  2540. /*.wsize =*/ cplan->work_size,
  2541. /*.wdata =*/ cplan->work_data,
  2542. /*.threadpool=*/ tp,
  2543. };
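// every active thread walks the graph node by node, with a barrier between nodes;
// an abort is published by storing the index of the next node in tp->abort, which stops
// all threads once they reach it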
  2544. for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) {
  2545. struct ggml_tensor * node = cgraph->nodes[node_n];
  2546. ggml_compute_forward(&params, node);
  2547. if (state->ith == 0 && cplan->abort_callback &&
  2548. cplan->abort_callback(cplan->abort_callback_data)) {
  2549. atomic_store_explicit(&tp->abort, node_n + 1, memory_order_relaxed);
  2550. tp->ec = GGML_STATUS_ABORTED;
  2551. }
  2552. if (node_n + 1 < cgraph->n_nodes) {
  2553. ggml_barrier(state->threadpool);
  2554. }
  2555. }
  2556. ggml_barrier(state->threadpool);
  2557. return 0;
  2558. }
  2559. #ifndef GGML_USE_OPENMP
  2560. // check if thread is active
  2561. static inline bool ggml_graph_compute_thread_active(struct ggml_compute_state * state) {
  2562. struct ggml_threadpool * threadpool = state->threadpool;
  2563. int n_threads = atomic_load_explicit(&threadpool->n_threads_cur, memory_order_relaxed);
  2564. return (state->ith < n_threads);
  2565. }
  2566. // check if thread is ready to proceed (exit from polling or sleeping)
  2567. static inline bool ggml_graph_compute_thread_ready(struct ggml_compute_state * state) {
  2568. struct ggml_threadpool * threadpool = state->threadpool;
  2569. if (state->pending || threadpool->stop || threadpool->pause) { return true; }
  2570. // check for new graph/work
  2571. int new_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed);
  2572. if (new_graph != state->last_graph) {
  2573. state->pending = ggml_graph_compute_thread_active(state);
  2574. state->last_graph = new_graph;
  2575. }
  2576. return state->pending;
  2577. }
  2578. // sync thread state after polling
  2579. static inline void ggml_graph_compute_thread_sync(struct ggml_compute_state * state) {
  2580. // TSAN doesn't support standalone fence yet, we use a dummy read-modify-write instead
  2581. #ifdef GGML_TSAN_ENABLED
  2582. atomic_fetch_add_explicit(&state->threadpool->n_graph, 0, memory_order_seq_cst);
  2583. #else
  2584. atomic_thread_fence(memory_order_seq_cst);
  2585. #endif
  2586. UNUSED(state);
  2587. }
  2588. static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) {
  2589. struct ggml_threadpool * threadpool = state->threadpool;
  2590. // Skip polling for unused threads
  2591. if (!ggml_graph_compute_thread_active(state)) {
  2592. return state->pending;
  2593. }
  2594. // This seems to make 0 ... 100 a decent range for polling level across modern processors.
2595. // Perhaps we can adjust it dynamically based on load in the future.
  2596. const uint64_t n_rounds = 1024UL * 128 * threadpool->poll;
  2597. for (uint64_t i=0; !ggml_graph_compute_thread_ready(state) && i < n_rounds; i++) {
  2598. // No new work. Keep polling.
  2599. ggml_thread_cpu_relax();
  2600. }
  2601. return state->pending;
  2602. }
  2603. static inline bool ggml_graph_compute_check_for_work(struct ggml_compute_state * state) {
  2604. struct ggml_threadpool * threadpool = state->threadpool;
  2605. if (ggml_graph_compute_poll_for_work(state)) {
  2606. ggml_graph_compute_thread_sync(state);
  2607. return state->pending;
  2608. }
  2609. ggml_mutex_lock_shared(&threadpool->mutex);
  2610. while (!ggml_graph_compute_thread_ready(state)) {
  2611. // No new work. Wait for the signal.
  2612. GGML_PRINT_DEBUG("thread #%d waiting for work (sleeping)\n", state->ith);
  2613. ggml_cond_wait(&threadpool->cond, &threadpool->mutex);
  2614. }
  2615. ggml_mutex_unlock_shared(&threadpool->mutex);
  2616. return state->pending;
  2617. }
  2618. static thread_ret_t ggml_graph_compute_secondary_thread(void* data) {
  2619. struct ggml_compute_state * state = (struct ggml_compute_state *) data;
  2620. struct ggml_threadpool * threadpool = state->threadpool;
  2621. ggml_thread_apply_priority(threadpool->prio);
  2622. if (ggml_thread_cpumask_is_valid(state->cpumask)) {
  2623. ggml_thread_apply_affinity(state->cpumask);
  2624. }
  2625. while (true) {
  2626. // Check if we need to sleep
  2627. while (threadpool->pause) {
  2628. GGML_PRINT_DEBUG("thread #%d inside pause loop\n", state->ith);
  2629. ggml_mutex_lock_shared(&threadpool->mutex);
  2630. if (threadpool->pause) {
  2631. ggml_cond_wait(&threadpool->cond, &threadpool->mutex);
  2632. }
  2633. GGML_PRINT_DEBUG("thread #%d resuming after wait\n", state->ith);
  2634. ggml_mutex_unlock_shared(&threadpool->mutex);
  2635. }
2636. // This needs to be checked after the cond_wait
  2637. if (threadpool->stop) break;
  2638. // Check if there is new work
  2639. // The main thread is the only one that can dispatch new work
  2640. ggml_graph_compute_check_for_work(state);
  2641. if (state->pending) {
  2642. state->pending = false;
  2643. ggml_graph_compute_thread(state);
  2644. }
  2645. }
  2646. return (thread_ret_t) 0;
  2647. }
  2648. // Start processing new graph
  2649. static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool, int n_threads)
  2650. {
  2651. // Always take the mutex here because the worker threads are doing hybrid poll/wait
  2652. ggml_mutex_lock(&threadpool->mutex);
  2653. GGML_PRINT_DEBUG("threadpool: n_threads_cur %d n_threads %d\n", threadpool->n_threads_cur, n_threads);
  2654. // Update the number of active threads
  2655. atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
  2656. // Indicate the graph is ready to be processed
  2657. // We need the full seq-cst fence here because of the polling threads (used in thread_sync)
  2658. atomic_fetch_add_explicit(&threadpool->n_graph, 1, memory_order_seq_cst);
  2659. if (threadpool->pause) {
  2660. // Update main thread prio and affinity to match the threadpool settings
  2661. ggml_thread_apply_priority(threadpool->prio);
  2662. if (ggml_thread_cpumask_is_valid(threadpool->workers[0].cpumask)) {
  2663. ggml_thread_apply_affinity(threadpool->workers[0].cpumask);
  2664. }
  2665. // resume does cond broadcast
  2666. ggml_threadpool_resume_locked(threadpool);
  2667. } else {
  2668. ggml_cond_broadcast(&threadpool->cond);
  2669. }
  2670. ggml_mutex_unlock(&threadpool->mutex);
  2671. }
  2672. #endif // GGML_USE_OPENMP
static struct ggml_threadpool * ggml_threadpool_new_impl(
    struct ggml_threadpool_params * tpp,
               struct ggml_cgraph * cgraph,
                struct ggml_cplan * cplan) {

    struct ggml_threadpool * threadpool =
        ggml_aligned_malloc(sizeof(struct ggml_threadpool));
    {
        threadpool->cgraph           = cgraph;
        threadpool->cplan            = cplan;
        threadpool->n_graph          = 0;
        threadpool->n_barrier        = 0;
        threadpool->n_barrier_passed = 0;
        threadpool->current_chunk    = 0;
        threadpool->stop             = false;
        threadpool->pause            = tpp->paused;
        threadpool->abort            = -1;
        threadpool->workers          = NULL;
        threadpool->n_threads_max    = tpp->n_threads;
        threadpool->n_threads_cur    = tpp->n_threads;
        threadpool->poll             = tpp->poll;
        threadpool->prio             = tpp->prio;
        threadpool->ec               = GGML_STATUS_SUCCESS;
    }

    // Allocate and init workers state
    const size_t workers_size = sizeof(struct ggml_compute_state) * tpp->n_threads;
    struct ggml_compute_state * workers = ggml_aligned_malloc(workers_size);

    memset(workers, 0, workers_size);
    for (int j = 0; j < tpp->n_threads; j++) {
        workers[j].threadpool = threadpool;
        workers[j].ith        = j;
    }

    threadpool->workers = workers;

#ifdef GGML_USE_OPENMP
    int32_t cpumask_iter = 0;

    // Compute CPU masks for each thread
    for (int j = 0; j < tpp->n_threads; j++) {
        ggml_thread_cpumask_next(tpp->cpumask, workers[j].cpumask, tpp->strict_cpu, &cpumask_iter);
    }
#else // GGML_USE_OPENMP
    ggml_mutex_init(&threadpool->mutex);
    ggml_cond_init(&threadpool->cond);

    // Spin the threads for all workers, and update CPU placements.
    // Place the main thread last (towards the higher numbered CPU cores).
    int32_t cpumask_iter = 0;

    for (int j = 1; j < tpp->n_threads; j++) {
        ggml_thread_cpumask_next(tpp->cpumask, workers[j].cpumask, tpp->strict_cpu, &cpumask_iter);

        int32_t rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_secondary_thread, &workers[j]);
        GGML_ASSERT(rc == 0);
    }

    ggml_thread_cpumask_next(tpp->cpumask, workers[0].cpumask, tpp->strict_cpu, &cpumask_iter);

    if (!threadpool->pause) {
        // Update main thread prio and affinity at the start, otherwise we'll do it in resume
        ggml_thread_apply_priority(threadpool->prio);
        if (ggml_thread_cpumask_is_valid(threadpool->workers[0].cpumask)) {
            ggml_thread_apply_affinity(threadpool->workers[0].cpumask);
        }
    }
#endif // GGML_USE_OPENMP

    return threadpool;
}

struct ggml_threadpool * ggml_threadpool_new(struct ggml_threadpool_params * tpp) {
    return ggml_threadpool_new_impl(tpp, NULL, NULL);
}
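
// Evaluate a graph according to the given cplan. If cplan->threadpool is NULL a
// disposable threadpool is created for this call and freed before returning;
// otherwise the caller-provided threadpool is reused and only the per-graph state
// (cgraph, cplan, current_chunk, abort, ec) is reset.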
enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
    ggml_cpu_init();

    GGML_ASSERT(cplan);
    GGML_ASSERT(cplan->n_threads > 0);
    GGML_ASSERT(cplan->work_size == 0 || cplan->work_data != NULL);

    int n_threads = cplan->n_threads;
    struct ggml_threadpool * threadpool = cplan->threadpool;

    bool disposable_threadpool = false;

    if (threadpool == NULL) {
        //GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %d\n", n_threads);
        disposable_threadpool = true;

        struct ggml_threadpool_params ttp = ggml_threadpool_params_default(n_threads);
        threadpool = ggml_threadpool_new_impl(&ttp, cgraph, cplan);
    } else {
        // Reset some of the parameters that need resetting
        // No worker threads should be accessing the parameters below at this stage
        threadpool->cgraph        = cgraph;
        threadpool->cplan         = cplan;
        threadpool->current_chunk = 0;
        threadpool->abort         = -1;
        threadpool->ec            = GGML_STATUS_SUCCESS;
    }

#ifdef GGML_USE_OPENMP
    if (n_threads > 1) {
        #pragma omp parallel num_threads(n_threads)
        {
            #pragma omp single
            {
                // update the number of threads from the actual number of threads that we got from OpenMP
                n_threads = omp_get_num_threads();
                atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
            }

            // Apply thread CPU mask and priority
            int ith = omp_get_thread_num();

            ggml_thread_apply_priority(threadpool->prio);
            if (ggml_thread_cpumask_is_valid(threadpool->workers[ith].cpumask)) {
                ggml_thread_apply_affinity(threadpool->workers[ith].cpumask);
            }

            ggml_graph_compute_thread(&threadpool->workers[ith]);
        }
    } else {
        atomic_store_explicit(&threadpool->n_threads_cur, 1, memory_order_relaxed);
        ggml_graph_compute_thread(&threadpool->workers[0]);
    }
#else
    if (n_threads > threadpool->n_threads_max) {
        GGML_LOG_WARN("cplan requested more threads (%d) than available (%d)\n", n_threads, threadpool->n_threads_max);
        n_threads = threadpool->n_threads_max;
    }

    // Kick all threads to start the new graph
    ggml_graph_compute_kickoff(threadpool, n_threads);

    // This is a work thread too
    ggml_graph_compute_thread(&threadpool->workers[0]);
#endif

    // don't leave affinity set on the main thread
    clear_numa_thread_affinity();

    enum ggml_status ret = threadpool->ec;

    if (disposable_threadpool) {
        ggml_threadpool_free(threadpool);
    }

    return ret;
}
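
// Usage sketch (illustrative only, built from the calls above): reusing a
// caller-managed threadpool across graph evaluations instead of relying on the
// disposable threadpool. The work buffer is caller-owned here;
// ggml_graph_compute_with_ctx below shows the context-backed alternative.
//
//   struct ggml_threadpool_params tpp = ggml_threadpool_params_default(n_threads);
//   struct ggml_threadpool * tp = ggml_threadpool_new(&tpp);
//
//   struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads, tp);
//   cplan.work_data = malloc(cplan.work_size);
//
//   ggml_graph_compute(cgraph, &cplan);   // can be called repeatedly with the same tp
//
//   free(cplan.work_data);
//   ggml_threadpool_free(tp);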
enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
    struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads, NULL);

    cplan.work_data = (uint8_t *)ggml_new_buffer(ctx, cplan.work_size);

    return ggml_graph_compute(cgraph, &cplan);
}

void ggml_cpu_fp32_to_fp32(const float * x, float * y, int64_t n) {
    memcpy(y, x, n * sizeof(float));
}
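
// Convert fp32 -> fp16. With F16C the bulk of the array is converted 16/8/4 elements
// at a time (AVX-512 / AVX / SSE paths); with the RISC-V Zvfh extension a vector
// narrowing convert is used; any remainder falls through to the scalar loop.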
void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) {
    int64_t i = 0;
#if defined(__F16C__)
#if defined(__AVX512F__)
    for (; i + 15 < n; i += 16) {
        __m512 x_vec = _mm512_loadu_ps(x + i);
        __m256i y_vec = _mm512_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
        _mm256_storeu_si256((__m256i *)(y + i), y_vec);
    }
#endif
    for (; i + 7 < n; i += 8) {
        __m256 x_vec = _mm256_loadu_ps(x + i);
        __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
        _mm_storeu_si128((__m128i *)(y + i), y_vec);
    }
    for (; i + 3 < n; i += 4) {
        __m128 x_vec = _mm_loadu_ps(x + i);
        __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
        _mm_storel_epi64((__m128i *)(y + i), y_vec);
    }
#elif defined(__riscv_zvfh)
    for (int vl; i < n; i += vl) {
        vl = __riscv_vsetvl_e32m2(n - i);
        vfloat32m2_t vx = __riscv_vle32_v_f32m2(&x[i], vl);
        vfloat16m1_t vy = __riscv_vfncvt_f_f_w_f16m1(vx, vl);
        __riscv_vse16_v_f16m1((_Float16 *)&y[i], vy, vl);
    }
#endif
    for (; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(x[i]);
    }
}
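
// Convert fp16 -> fp32 (inverse of the above), widening 16/8/4 elements at a time
// with F16C where available and finishing with the scalar loop.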
void ggml_cpu_fp16_to_fp32(const ggml_fp16_t * x, float * y, int64_t n) {
    int64_t i = 0;
#if defined(__F16C__)
#if defined(__AVX512F__)
    for (; i + 15 < n; i += 16) {
        __m256i x_vec = _mm256_loadu_si256((const __m256i *)(x + i));
        __m512 y_vec = _mm512_cvtph_ps(x_vec);
        _mm512_storeu_ps(y + i, y_vec);
    }
#endif
    for (; i + 7 < n; i += 8) {
        __m128i x_vec = _mm_loadu_si128((const __m128i *)(x + i));
        __m256 y_vec = _mm256_cvtph_ps(x_vec);
        _mm256_storeu_ps(y + i, y_vec);
    }
    for (; i + 3 < n; i += 4) {
        __m128i x_vec = _mm_loadl_epi64((const __m128i *)(x + i));
        __m128 y_vec = _mm_cvtph_ps(x_vec);
        _mm_storeu_ps(y + i, y_vec);
    }
#endif
    for (; i < n; ++i) {
        y[i] = GGML_CPU_FP16_TO_FP32(x[i]);
    }
}

void ggml_cpu_fp32_to_bf16(const float * x, ggml_bf16_t * y, int64_t n) {
    int64_t i = 0;
    for (; i < n; ++i) {
        y[i] = GGML_FP32_TO_BF16(x[i]);
    }
}

void ggml_cpu_fp32_to_i32(const float * x, int32_t * y, int64_t n) {
    int64_t i = 0;
    for (; i < n; ++i) {
        y[i] = x[i];
    }
}
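
// Convert bf16 -> fp32. bf16 holds the upper 16 bits of an fp32, so the vector paths
// zero-extend each 16-bit value and shift it left by 16 to rebuild the fp32 bit
// pattern; the scalar GGML_BF16_TO_FP32 fallback does the same thing.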
void ggml_cpu_bf16_to_fp32(const ggml_bf16_t * x, float * y, int64_t n) {
    int64_t i = 0;
#if defined(__AVX2__)
#if defined(__AVX512F__)
    for (; i + 15 < n; i += 16) {
        _mm512_storeu_ps(y + i,
                         _mm512_castsi512_ps(
                             _mm512_slli_epi32(
                                 _mm512_cvtepu16_epi32(
                                     _mm256_loadu_si256(
                                         (const __m256i *)(x + i))),
                                 16)));
    }
#endif
    for (; i + 7 < n; i += 8) {
        _mm256_storeu_ps(y + i,
                         _mm256_castsi256_ps(
                             _mm256_slli_epi32(
                                 _mm256_cvtepu16_epi32(
                                     _mm_loadu_si128(
                                         (const __m128i *)(x + i))),
                                 16)));
    }
#endif
    for (; i < n; i++) {
        y[i] = GGML_BF16_TO_FP32(x[i]);
    }
}
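
// The ggml_cpu_has_* helpers below report what this binary was compiled with:
// each returns 1 iff the corresponding compiler macro was defined at build time,
// not whether the CPU executing the code supports the feature at runtime.
//
// Example (sketch):
//   if (ggml_cpu_has_avx2() && ggml_cpu_has_f16c()) {
//       // this build contains the AVX2/F16C code paths
//   }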
int ggml_cpu_has_avx(void) {
#if defined(__AVX__)
    return 1;
#else
    return 0;
#endif
}

int ggml_cpu_has_avx_vnni(void) {
#if defined(__AVXVNNI__)
    return 1;
#else
    return 0;
#endif
}

int ggml_cpu_has_avx2(void) {
#if defined(__AVX2__)
    return 1;
#else
    return 0;
#endif
}

int ggml_cpu_has_avx512(void) {
#if defined(__AVX512F__)
    return 1;
#else
    return 0;
#endif
}

int ggml_cpu_has_avx512_vbmi(void) {
#if defined(__AVX512VBMI__)
    return 1;
#else
    return 0;
#endif
}

int ggml_cpu_has_avx512_vnni(void) {
#if defined(__AVX512VNNI__)
    return 1;
#else
    return 0;
#endif
}

int ggml_cpu_has_avx512_bf16(void) {
#if defined(__AVX512BF16__)
    return 1;
#else
    return 0;
#endif
}

int ggml_cpu_has_amx_int8(void) {
#if defined(__AMX_INT8__)
    return 1;
#else
    return 0;
#endif
}

int ggml_cpu_has_bmi2(void) {
#if defined(__BMI2__)
    return 1;
#else
    return 0;
#endif
}

int ggml_cpu_has_fma(void) {
#if defined(__FMA__)
    return 1;
#else
    return 0;
#endif
}

int ggml_cpu_has_arm_fma(void) {
#if defined(__ARM_FEATURE_FMA)
    return 1;
#else
    return 0;
#endif
}

int ggml_cpu_has_riscv_v(void) {
#if defined(__riscv_v_intrinsic)
    return 1;
#else
    return 0;
#endif
}

int ggml_cpu_has_f16c(void) {
#if defined(__F16C__)
    return 1;
#else
    return 0;
#endif
}

int ggml_cpu_has_fp16_va(void) {
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
    return 1;
#else
    return 0;
#endif
}

int ggml_cpu_has_wasm_simd(void) {
#if defined(__wasm_simd128__)
    return 1;
#else
    return 0;
#endif
}

int ggml_cpu_has_llamafile(void) {
#if defined(GGML_USE_LLAMAFILE)
    return 1;
#else
    return 0;
#endif
}

int ggml_cpu_has_sse3(void) {
#if defined(__SSE3__)
    return 1;
#else
    return 0;
#endif
}

int ggml_cpu_has_ssse3(void) {
#if defined(__SSSE3__)
    return 1;
#else
    return 0;
#endif
}

int ggml_cpu_has_vsx(void) {
#if defined(__POWER9_VECTOR__)
    return 1;
#else
    return 0;
#endif
}

int ggml_cpu_has_vxe(void) {
#if defined(__VXE__) || defined(__VXE2__)
    return 1;
#else
    return 0;
#endif
}

int ggml_cpu_has_neon(void) {
#if defined(__ARM_ARCH) && defined(__ARM_NEON)
    return 1;
#else
    return 0;
#endif
}

int ggml_cpu_has_dotprod(void) {
#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_DOTPROD)
    return 1;
#else
    return 0;
#endif
}

int ggml_cpu_has_sve(void) {
#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SVE)
    return 1;
#else
    return 0;
#endif
}

int ggml_cpu_has_matmul_int8(void) {
#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_MATMUL_INT8)
    return 1;
#else
    return 0;
#endif
}

int ggml_cpu_get_sve_cnt(void) {
#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SVE)
    return ggml_arm_arch_features.sve_cnt;
#else
    return 0;
#endif
}

int ggml_cpu_has_sme(void) {
#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SME)
    return 1;
#else
    return 0;
#endif
}
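
// One-time CPU backend initialization: builds the fp16 lookup tables (f32, GELU,
// Quick GELU), optionally sets KMP_BLOCKTIME under OpenMP, and probes Arm
// architecture features. The work is guarded by the critical section and a
// first-call flag, so calling it from multiple threads is safe.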
void ggml_cpu_init(void) {
    // needed to initialize ggml_time
    {
        struct ggml_init_params params = { 0, NULL, false };
        struct ggml_context * ctx = ggml_init(params);
        ggml_free(ctx);
    }

    ggml_critical_section_start();

    static bool is_first_call = true;

    if (is_first_call) {
        // initialize GELU, Quick GELU, SILU and EXP F32 tables
        {
            const uint64_t t_start = ggml_time_us(); UNUSED(t_start);

            for (int i = 0; i < (1 << 16); ++i) {
                union {
                    uint16_t u16;
                    ggml_fp16_t fp16;
                } u = {i};
                float f = GGML_COMPUTE_FP16_TO_FP32(u.fp16);
                ggml_table_f32_f16[i]        = f;
                ggml_table_gelu_f16[i]       = GGML_CPU_FP32_TO_FP16(ggml_gelu_f32(f));
                ggml_table_gelu_quick_f16[i] = GGML_CPU_FP32_TO_FP16(ggml_gelu_quick_f32(f));
            }

            const uint64_t t_end = ggml_time_us(); UNUSED(t_end);

            GGML_PRINT_DEBUG("%s: GELU, Quick GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0);

#ifdef GGML_USE_OPENMP
            //if (!getenv("OMP_WAIT_POLICY")) {
            //    // set the wait policy to active, so that OpenMP threads don't sleep
            //    putenv("OMP_WAIT_POLICY=active");
            //}

            if (!getenv("KMP_BLOCKTIME")) {
                // set the time to wait before sleeping a thread
                // this is less aggressive than setting the wait policy to active, but should achieve similar results in most cases
                putenv("KMP_BLOCKTIME=200"); // 200ms
            }
#endif
        }

#if defined(__ARM_ARCH)
        ggml_init_arm_arch_features();
#endif

        is_first_call = false;
    }

    ggml_critical_section_end();
}